/external/libvpx/libvpx/vpx_dsp/x86/
variance_sse2.c
   26  __m128i vsum = _mm_setzero_si128();  in vpx_get_mb_ss_sse2() [local]
   31  vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));  in vpx_get_mb_ss_sse2()
   35  return add32x4_sse2(vsum);  in vpx_get_mb_ss_sse2()
   56  static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum,  [argument]
   61  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));  in variance_final_128_pel_sse2()
   62  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));  in variance_final_128_pel_sse2()
   63  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));  in variance_final_128_pel_sse2()
   64  *sum = (int16_t)_mm_extract_epi16(vsum, 0);  in variance_final_128_pel_sse2()
   68  static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum,  [argument]
   73  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));  in variance_final_256_pel_sse2()
   [all …]
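The variance_final_*_pel_sse2 hits above all use the same shift-and-fold reduction: the eight 16-bit lanes of vsum are halved three times until the total sits in lane 0, which is safe for the 128-pel case because a sum of 128 pixel differences always fits in an int16. A minimal standalone sketch of that reduction (my own harness, not libvpx code; compile for x86-64 or with -msse2):

    #include <emmintrin.h>  /* SSE2 */
    #include <stdint.h>
    #include <stdio.h>

    /* Horizontal sum of eight int16 lanes, as in variance_final_*_pel_sse2:
     * each step adds the upper half of the remaining lanes onto the lower half. */
    static int16_t hsum_epi16_sse2(__m128i vsum) {
      vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); /* 8 lanes -> 4 */
      vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); /* 4 lanes -> 2 */
      vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); /* 2 lanes -> 1 */
      return (int16_t)_mm_extract_epi16(vsum, 0);
    }

    int main(void) {
      const __m128i v = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
      printf("%d\n", hsum_epi16_sse2(v)); /* prints 36 */
      return 0;
    }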
variance_avx2.c
   62  __m128i vsum,  in variance_final_from_32bit_sum_avx2() [argument]
   70  const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum);  in variance_final_from_32bit_sum_avx2()
   71  const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum);  in variance_final_from_32bit_sum_avx2()
   81  __m256i vsum,  in variance_final_from_16bit_sum_avx2() [argument]
   85  const __m128i sum_reg_128 = _mm_add_epi16(_mm256_castsi256_si128(vsum),  in variance_final_from_16bit_sum_avx2()
   86      _mm256_extractf128_si256(vsum, 1));  in variance_final_from_16bit_sum_avx2()
  125  __m256i *const vsum) {  in variance16_avx2() [argument]
  127  *vsum = _mm256_setzero_si256();  in variance16_avx2()
  131  variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum);  in variance16_avx2()
  140  __m256i *const vsum) {  in variance32_avx2() [argument]
   [all …]
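Lines 85-86 show the usual first step for reducing a 256-bit accumulator: add the high 128-bit half onto the low half, after which SSE2 folds like the ones above finish the job. A hedged sketch of just that fold (assumes an AVX2 target; the surrounding libvpx variance code is not reproduced):

    #include <immintrin.h>  /* AVX2 */
    #include <stdio.h>

    /* Fold sixteen int16 lanes into eight by adding the high 128-bit half of
     * the 256-bit accumulator onto the low half, as in lines 85-86 above. */
    static __m128i fold_to_128_epi16(__m256i vsum) {
      return _mm_add_epi16(_mm256_castsi256_si128(vsum),
                           _mm256_extractf128_si256(vsum, 1));
    }

    int main(void) {
      const __m256i v = _mm256_set1_epi16(3);       /* sixteen lanes of 3 */
      const __m128i folded = fold_to_128_epi16(v);
      printf("%d\n", _mm_extract_epi16(folded, 0)); /* 6: two lanes of 3 added */
      return 0;
    }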
/external/XNNPACK/src/f32-gavgpool-cw/
wasmsimd-arm-x4.c  (all hits in xnn_f32_gavgpool_cw_ukernel__wasmsimd_arm_x4())
   78  const v128_t vsum = wasm_f32x4_add(wasm_v32x4_shuffle(vsum01, vsum23, 0, 2, 4, 6), wasm_v32x4_shuf…  [local]
   79  v128_t vout = wasm_f32x4_mul(vsum, vmultiplier);
   94  v128_t vsum = wasm_f64x2_splat(0.0);  [local]
   99  vsum = wasm_f32x4_add(vsum, vi0);
  106  vsum = wasm_f32x4_add(vsum, vi0);
  109  vsum = wasm_f32x4_add(wasm_v32x4_shuffle(vsum, vsum, 0, 2, 4, 6), wasm_v32x4_shuffle(vsum, vsum, 1…
  110  vsum = wasm_f32x4_add(wasm_v32x4_shuffle(vsum, vsum, 0, 2, 4, 6), wasm_v32x4_shuffle(vsum, vsum, 1…
  112  v128_t vout = wasm_f32x4_mul(vsum, vmultiplier);
wasmsimd-x86-x4.c  (all hits in xnn_f32_gavgpool_cw_ukernel__wasmsimd_x86_x4())
   78  const v128_t vsum = wasm_f32x4_add(wasm_v32x4_shuffle(vsum01, vsum23, 0, 2, 4, 6), wasm_v32x4_shuf…  [local]
   79  v128_t vout = wasm_f32x4_mul(vsum, vmultiplier);
   96  v128_t vsum = wasm_f64x2_splat(0.0);  [local]
  101  vsum = wasm_f32x4_add(vsum, vi0);
  108  vsum = wasm_f32x4_add(vsum, vi0);
  111  vsum = wasm_f32x4_add(wasm_v32x4_shuffle(vsum, vsum, 0, 2, 4, 6), wasm_v32x4_shuffle(vsum, vsum, 1…
  112  vsum = wasm_f32x4_add(wasm_v32x4_shuffle(vsum, vsum, 0, 2, 4, 6), wasm_v32x4_shuffle(vsum, vsum, 1…
  114  v128_t vout = wasm_f32x4_mul(vsum, vmultiplier);
sse-x4.c  (all hits in xnn_f32_gavgpool_cw_ukernel__sse_x4())
   78  const __m128 vsum = _mm_add_ps(_mm_movelh_ps(vsum01, vsum23), _mm_movehl_ps(vsum23, vsum01));  [local]
   79  __m128 vout = _mm_mul_ps(vsum, vmultiplier);
   94  __m128 vsum = _mm_setzero_ps();  [local]
   99  vsum = _mm_add_ps(vsum, vi0);
  106  vsum = _mm_add_ps(vsum, vi0);
  109  vsum = _mm_add_ps(vsum, _mm_movehl_ps(vsum, vsum));
  110  vsum = _mm_add_ss(vsum, _mm_shuffle_ps(vsum, vsum, _MM_SHUFFLE(3, 2, 1, 1)));
  112  __m128 vout = _mm_mul_ss(vsum, vmultiplier);
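Lines 109-110 are the classic two-step SSE horizontal sum of four floats, the same reduction the wasmsimd variants above spell out with wasm_v32x4_shuffle: add the high pair onto the low pair, then add lane 1 into lane 0. A self-contained sketch (my harness, not the XNNPACK source):

    #include <xmmintrin.h>  /* SSE */
    #include <stdio.h>

    /* Horizontal sum of four floats, mirroring lines 109-110 of sse-x4.c. */
    static float hsum_ps_sse(__m128 vsum) {
      /* lanes become {a+c, b+d, _, _} */
      vsum = _mm_add_ps(vsum, _mm_movehl_ps(vsum, vsum));
      /* add lane 1 (b+d) into lane 0 (a+c) */
      vsum = _mm_add_ss(vsum, _mm_shuffle_ps(vsum, vsum, _MM_SHUFFLE(3, 2, 1, 1)));
      return _mm_cvtss_f32(vsum);
    }

    int main(void) {
      const __m128 v = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
      printf("%f\n", hsum_ps_sse(v)); /* prints 10.0 */
      return 0;
    }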
neon-x4.c  (all hits in xnn_f32_gavgpool_cw_ukernel__neon_x4())
   76  const float32x4_t vsum = vpaddq_f32(vsum01, vsum23);  [local]
   82  const float32x4_t vsum = vcombine_f32(vpadd_f32(vget_low_f32(vsum01), vget_high_f32(vsum01)),  [local]
   86  float32x4_t vout = vmulq_f32(vsum, vmultiplier);
  114  float32x2_t vsum = vadd_f32(vget_low_f32(vsum0), vget_high_f32(vsum0));  [local]
  115  vsum = vpadd_f32(vsum, vsum);
  117  float32x2_t vout = vmul_f32(vsum, vget_low_f32(vmultiplier));
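The NEON variant keeps two paths: vpaddq_f32 on line 76 is AArch64-only, while lines 82 and 114-115 use the 64-bit vpadd_f32 pairwise add that is also available on 32-bit ARM. A sketch of the reduction in lines 114-115 (assumes a NEON-enabled ARM target):

    #include <arm_neon.h>
    #include <stdio.h>

    /* Horizontal sum of four floats as in lines 114-115 of neon-x4.c:
     * add the high 64-bit half onto the low half, then pairwise-add the pair. */
    static float hsum_f32_neon(float32x4_t v) {
      float32x2_t vsum = vadd_f32(vget_low_f32(v), vget_high_f32(v)); /* {a+c, b+d} */
      vsum = vpadd_f32(vsum, vsum);                                   /* {a+b+c+d, ...} */
      return vget_lane_f32(vsum, 0);
    }

    int main(void) {
      const float data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
      printf("%f\n", hsum_f32_neon(vld1q_f32(data))); /* prints 10.0 */
      return 0;
    }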
/external/llvm-project/llvm/test/MC/VE/
VSUMX.s
    6  # CHECK-INST: vsum.l %v11, %v12
    8  vsum.l %v11, %v12
   10  # CHECK-INST: vsum.l %v11, %vix, %vm11
   12  vsum.l %v11, %vix, %vm11
   14  # CHECK-INST: vsum.l %vix, %v22, %vm15
   16  vsum.l %vix, %v22, %vm15
   18  # CHECK-INST: vsum.l %v63, %v60, %vm2
   20  vsum.l %v63, %v60, %vm2
   22  # CHECK-INST: vsum.l %vix, %vix
   24  vsum.l %vix, %vix, %vm0
   [all …]
VSUMS.s
    6  # CHECK-INST: vsum.w.sx %v11, %v12
    8  vsum.w.sx %v11, %v12
   10  # CHECK-INST: vsum.w.sx %v11, %vix, %vm11
   12  vsum.w.sx %v11, %vix, %vm11
   14  # CHECK-INST: vsum.w.sx %vix, %v22, %vm15
   16  vsum.w.sx %vix, %v22, %vm15
   18  # CHECK-INST: vsum.w.zx %v63, %v60, %vm2
   20  vsum.w.zx %v63, %v60, %vm2
   22  # CHECK-INST: vsum.w.zx %vix, %vix
   24  vsum.w.zx %vix, %vix, %vm0
   [all …]
/external/arm-optimized-routines/networking/arm/
chksum_simd.c  (all hits in __chksum_arm_simd())
   21  uint64x1_t vsum = { 0 };  [local]
   44  vsum = vpaddl_u32(vtmp);
   97  vsum = vpadal_u32(vsum, vtmp);
  105  vsum = vpadal_u32(vsum, vtmp);
  125  vsum = vpadal_u32(vsum, vreinterpret_u32_u64(vword64));
  129  vsum = vpaddl_u32(vreinterpret_u32_u64(vsum));
  130  vsum = vpaddl_u32(vreinterpret_u32_u64(vsum));
  131  Assert(vget_lane_u32(vreinterpret_u32_u64(vsum), 1) == 0);
  134  uint32x2_t vsum32 = vreinterpret_u32_u64(vsum);
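The checksum kernel leans on NEON's widening pairwise adds: vpaddl_u32 sums two u32 lanes into one u64 lane, and vpadal_u32 does the same while accumulating onto a 64-bit running total, so 32-bit partial sums can never drop carries. A small demonstration of the two intrinsics (my example values, not the checksum code itself; NEON target assumed):

    #include <arm_neon.h>
    #include <stdio.h>

    int main(void) {
      /* two u32 lanes: {0xFFFFFFFF, 0x00000001} */
      const uint32x2_t a = vcreate_u32(0xFFFFFFFFull | (0x00000001ull << 32));
      /* widening pairwise add: 0xFFFFFFFF + 1 = 0x100000000, no carry lost */
      uint64x1_t vsum = vpaddl_u32(a);
      /* pairwise add and accumulate the same pair again */
      vsum = vpadal_u32(vsum, a);
      printf("0x%llx\n", (unsigned long long)vget_lane_u64(vsum, 0)); /* 0x200000000 */
      return 0;
    }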
/external/llvm-project/libc/AOR_v20.02/networking/arm/
chksum_simd.c  (all hits in __chksum_arm_simd())
   22  uint64x1_t vsum = { 0 };  [local]
   45  vsum = vpaddl_u32(vtmp);
   98  vsum = vpadal_u32(vsum, vtmp);
  106  vsum = vpadal_u32(vsum, vtmp);
  126  vsum = vpadal_u32(vsum, vreinterpret_u32_u64(vword64));
  130  vsum = vpaddl_u32(vreinterpret_u32_u64(vsum));
  131  vsum = vpaddl_u32(vreinterpret_u32_u64(vsum));
  132  Assert(vget_lane_u32(vreinterpret_u32_u64(vsum), 1) == 0);
  135  uint32x2_t vsum32 = vreinterpret_u32_u64(vsum);
/external/libaom/libaom/aom_dsp/x86/
variance_sse2.c
   30  __m128i vsum = _mm_setzero_si128();  in aom_get_mb_ss_sse2() [local]
   35  vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));  in aom_get_mb_ss_sse2()
   39  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));  in aom_get_mb_ss_sse2()
   40  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));  in aom_get_mb_ss_sse2()
   41  return _mm_cvtsi128_si32(vsum);  in aom_get_mb_ss_sse2()
   80  static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum,  [argument]
   85  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));  in variance_final_128_pel_sse2()
   86  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));  in variance_final_128_pel_sse2()
   87  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));  in variance_final_128_pel_sse2()
   88  *sum = (int16_t)_mm_extract_epi16(vsum, 0);  in variance_final_128_pel_sse2()
   [all …]
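aom_get_mb_ss_sse2 gets the sum of squares almost for free: _mm_madd_epi16(v, v) squares eight int16 values and adds them pairwise into four int32 lanes, and the two _mm_srli_si128 folds finish the reduction (the libvpx twin above ends the same way inside add32x4_sse2). A standalone sketch of that idiom (my harness, not the libaom source):

    #include <emmintrin.h>  /* SSE2 */
    #include <stdio.h>

    /* Sum of squares in the style of aom_get_mb_ss_sse2: the madd produces
     * {v0*v0+v1*v1, v2*v2+v3*v3, ...} as four int32 lanes, then fold. */
    static int sum_sq_epi16_sse2(__m128i v) {
      __m128i vsum = _mm_madd_epi16(v, v);                 /* pairwise squares */
      vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); /* 4 lanes -> 2 */
      vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); /* 2 lanes -> 1 */
      return _mm_cvtsi128_si32(vsum);
    }

    int main(void) {
      const __m128i v = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
      printf("%d\n", sum_sq_epi16_sse2(v)); /* prints 204 = 1+4+9+...+64 */
      return 0;
    }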
variance_avx2.c
   48  static INLINE int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum,  [argument]
   54  const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum);  in variance_final_from_32bit_sum_avx2()
   55  const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum);  in variance_final_from_32bit_sum_avx2()
   65  static INLINE int variance_final_512_avx2(__m256i vsse, __m256i vsum,  [argument]
   68  const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);  in variance_final_512_avx2()
   75  static INLINE int variance_final_1024_avx2(__m256i vsse, __m256i vsum,  [argument]
   78  const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);  in variance_final_1024_avx2()
   93  static INLINE int variance_final_2048_avx2(__m256i vsse, __m256i vsum,  [argument]
   95  vsum = sum_to_32bit_avx2(vsum);  in variance_final_2048_avx2()
   96  const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum);  in variance_final_2048_avx2()
   [all …]
/external/webrtc/modules/video_processing/util/
denoiser_filter_sse2.cc  (all hits in Get8x8varSse2())
   26  __m128i vsum = _mm_setzero_si128();  [local]
   42  vsum = _mm_add_epi16(vsum, diff0);
   43  vsum = _mm_add_epi16(vsum, diff1);
   49  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
   50  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
   51  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
   52  *sum = static_cast<int16_t>(_mm_extract_epi16(vsum, 0));
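Get8x8varSse2 only returns sum and sse through out-parameters; in this family of functions the caller typically forms the variance of the 8x8 block as sse - sum^2/64. A hedged scalar sketch of that final step (the caller-side expression is not among the hits above, so this is an assumption about how the two outputs are combined):

    #include <stdint.h>
    #include <stdio.h>

    /* Variance of an 8x8 block from its accumulated difference sum and sum
     * of squared differences: var = sse - sum^2 / 64 (>> 6 divides by 64). */
    static uint32_t variance_8x8(uint32_t sse, int sum) {
      return sse - (uint32_t)(((int64_t)sum * sum) >> 6);
    }

    int main(void) {
      /* 64 diffs of +2: sum = 128, sse = 256, so variance = 256 - 256 = 0 */
      printf("%u\n", variance_8x8(256, 128));
      return 0;
    }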
/external/XNNPACK/src/u8-lut32norm/
scalar.c
   23  uint32_t vsum = 0;  in compute_sum() [local]
   26  vsum += t[vx];  in compute_sum()
   28  return vsum;  in compute_sum()
   39  const uint32_t vsum = compute_sum(n, x, t);  in xnn_u8_lut32norm_ukernel__scalar() [local]
   40  assert(vsum != 0);  in xnn_u8_lut32norm_ukernel__scalar()
   42  struct fxdiv_divisor_uint32_t vsum_divisor = fxdiv_init_uint32_t(vsum);  in xnn_u8_lut32norm_ukernel__scalar()
   43  const uint32_t vrounding = (vsum >> 1);  in xnn_u8_lut32norm_ukernel__scalar()
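The lut32norm kernel divides every table entry by vsum with round-to-nearest, and uses FXdiv to turn that runtime divisor into a multiply-and-shift: fxdiv_init_uint32_t precomputes the magic constants once, and adding vrounding = vsum >> 1 before taking the quotient rounds to nearest. A sketch under the assumption that the header-only FXdiv library is on the include path (the 2^8 scaling below is illustrative, not the kernel's actual fixed-point format):

    #include <stdint.h>
    #include <stdio.h>
    #include <fxdiv.h>  /* FXdiv, header-only; assumed available */

    int main(void) {
      const uint32_t vsum = 96;
      /* precompute magic constants once; every later quotient is mul+shift */
      const struct fxdiv_divisor_uint32_t vsum_divisor = fxdiv_init_uint32_t(vsum);
      const uint32_t vrounding = vsum >> 1; /* +0.5 in fixed point */

      const uint32_t scaled = 250u * 256u;  /* hypothetical table value * 2^8 */
      const uint32_t q = fxdiv_quotient_uint32_t(scaled + vrounding, vsum_divisor);
      printf("%u\n", q);  /* round(250*256/96) = 667 */
      return 0;
    }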
/external/python/pybind11/tests/
test_stl_binders.py
  213  vsum = 0
  216  vsum += v.value
  218  assert vsum == 150
  225  vsum = 0
  228  vsum += v.value
  230  assert vsum == 150
  249  vsum = 0
  253  vsum += v_i.value
  255  assert vsum == 7500
  263  vsum = 0
   [all …]
/external/libmpeg2/common/x86/
icv_variance_ssse3.c  (all hits in icv_variance_8x4_ssse3())
   94  __m128i vsum, vsum_sqr;  [local]
  128  vsum = _mm_add_epi64(sum_r0, sum_r1);
  129  vsum = _mm_add_epi64(vsum, _mm_srli_si128(vsum, 8));
  131  sum = _mm_cvtsi128_si32(vsum);
/external/XNNPACK/src/f32-gavgpool/
7p7x-minmax-scalar-c1.c  (all hits in xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1())
   52  const float vsum = vsum016 + vsum2345;  [local]
   54  *b++ = vsum;
   86  const float vsum = vsum0123 + vsum456a;  [local]
   88  *b++ = vsum;
  140  const float vsum = vsum0123 + vsum456a;  [local]
  142  float vout = vsum * vscale;
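The 7p7x kernels pool windows taller than seven rows in passes: the first pass writes a seven-row sum to a scratch buffer (*b++ = vsum), middle passes fold the buffered accumulator back in (the trailing 'a' in vsum456a plausibly names that accumulator), and the last pass multiplies by vscale = 1/rows and clamps. The vsum016/vsum2345 names suggest a balanced addition tree over the seven row values; a sketch of that tree, with the intermediate names being my reconstruction (the hits only show the final additions):

    #include <stdio.h>

    /* Balanced tree sum of seven row values, matching the vsum016/vsum2345
     * naming in the first pass of the 7p7x kernels. */
    static float tree_sum7(const float i[7]) {
      const float vsum01 = i[0] + i[1];
      const float vsum23 = i[2] + i[3];
      const float vsum45 = i[4] + i[5];
      const float vsum016 = vsum01 + i[6];
      const float vsum2345 = vsum23 + vsum45;
      return vsum016 + vsum2345;  /* const float vsum = vsum016 + vsum2345; */
    }

    int main(void) {
      const float rows[7] = {1, 2, 3, 4, 5, 6, 7};
      printf("%f\n", tree_sum7(rows)); /* prints 28.0 */
      return 0;
    }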
7p7x-minmax-wasm-c1.c  (all hits in xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1())
   52  const float vsum = vsum016 + vsum2345;  [local]
   54  *b++ = vsum;
   86  const float vsum = vsum0123 + vsum456a;  [local]
   88  *b++ = vsum;
  140  const float vsum = vsum0123 + vsum456a;  [local]
  142  float vout = vsum * vscale;
7p7x-minmax-neon-c4.c  (all hits in xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4())
   54  const float32x4_t vsum = vaddq_f32(vsum016, vsum2345);  [local]
   56  vst1q_f32(b, vsum); b += 4;
   87  const float32x4_t vsum = vaddq_f32(vsum0123, vsum456a);  [local]
   89  vst1q_f32(b, vsum); b += 4;
  141  const float32x4_t vsum = vaddq_f32(vsum0123, vsum456a);  [local]
  143  float32x4_t vout = vmulq_f32(vsum, vscale);
  169  const float32x4_t vsum = vaddq_f32(vsum0123, vsum456a);  [local]
  171  float32x4_t vout = vmulq_f32(vsum, vscale);
7p7x-minmax-wasmsimd-arm-c4.c  (all hits in xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4())
   61  const v128_t vsum = wasm_f32x4_add(vsum016, vsum2345);  [local]
   63  wasm_v128_store(b, vsum);
  102  const v128_t vsum = wasm_f32x4_add(vsum0123, vsum456a);  [local]
  104  wasm_v128_store(b, vsum); b += 4;
  164  const v128_t vsum = wasm_f32x4_add(vsum0123, vsum456a);  [local]
  166  v128_t vout = wasm_f32x4_mul(vsum, vscale);
  193  const v128_t vsum = wasm_f32x4_add(vsum0123, vsum456a);  [local]
  195  v128_t vout = wasm_f32x4_mul(vsum, vscale);
7p7x-minmax-sse-c4.c  (all hits in xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4())
   61  const __m128 vsum = _mm_add_ps(vsum016, vsum2345);  [local]
   63  _mm_store_ps(b, vsum); b += 4;
  101  const __m128 vsum = _mm_add_ps(vsum0123, vsum456a);  [local]
  103  _mm_store_ps(b, vsum); b += 4;
  163  const __m128 vsum = _mm_add_ps(vsum0123, vsum456a);  [local]
  165  __m128 vout = _mm_mul_ps(vsum, vscale);
  192  const __m128 vsum = _mm_add_ps(vsum0123, vsum456a);  [local]
  194  __m128 vout = _mm_mul_ps(vsum, vscale);
7p7x-minmax-wasmsimd-x86-c4.c  (all hits in xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4())
   61  const v128_t vsum = wasm_f32x4_add(vsum016, vsum2345);  [local]
   63  wasm_v128_store(b, vsum);
  102  const v128_t vsum = wasm_f32x4_add(vsum0123, vsum456a);  [local]
  104  wasm_v128_store(b, vsum); b += 4;
  164  const v128_t vsum = wasm_f32x4_add(vsum0123, vsum456a);  [local]
  166  v128_t vout = wasm_f32x4_mul(vsum, vscale);
  193  const v128_t vsum = wasm_f32x4_add(vsum0123, vsum456a);  [local]
  195  v128_t vout = wasm_f32x4_mul(vsum, vscale);
/external/XNNPACK/src/f16-gavgpool/
7p7x-minmax-neonfp16arith-c8.c  (all hits in xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8())
   55  const float16x8_t vsum = vaddq_f16(vsum016, vsum2345);  [local]
   57  vst1q_f16(b, vsum); b += 8;
   88  const float16x8_t vsum = vaddq_f16(vsum0123, vsum456a);  [local]
   90  vst1q_f16(b, vsum); b += 8;
  142  const float16x8_t vsum = vaddq_f16(vsum0123, vsum456a);  [local]
  144  float16x8_t vout = vmulq_f16(vsum, vscale);
  170  const float16x8_t vsum = vaddq_f16(vsum0123, vsum456a);  [local]
  172  float16x8_t vout = vmulq_f16(vsum, vscale);
/external/XNNPACK/src/qu8-gavgpool/
7p7x-minmax-neon-c8.c  (all hits in xnn_qu8_gavgpool_minmax_ukernel_7p7x__neon_c8())
   59  const int16x8_t vsum = vreinterpretq_s16_u16(vaddq_u16(vsum016, vsum2345));  [local]
   61  const int32x4_t vacc_lo = vaddw_s16(vbias, vget_low_s16(vsum));
   62  const int32x4_t vacc_hi = vaddw_s16(vbias, vget_high_s16(vsum));
   96  const int16x8_t vsum = vreinterpretq_s16_u16(vaddq_u16(vsum016, vsum2345));  [local]
   98  vst1q_s32(acc, vaddw_s16(vacc_lo, vget_low_s16(vsum))); acc += 4;
   99  vst1q_s32(acc, vaddw_s16(vacc_hi, vget_high_s16(vsum))); acc += 4;
  158  const int16x8_t vsum = vreinterpretq_s16_u16(vaddq_u16(vsum016, vsum2345));  [local]
  159  vacc_lo = vaddw_s16(vacc_lo, vget_low_s16(vsum));
  160  vacc_hi = vaddw_s16(vacc_hi, vget_high_s16(vsum));
  230  const int16x8_t vsum = vreinterpretq_s16_u16(vaddq_u16(vsum016, vsum2345));  [local]
   [all …]
7p7x-minmax-scalar-c1.c  (all hits in xnn_qu8_gavgpool_minmax_ukernel_7p7x__scalar_c1())
   56  const uint32_t vsum = vsum016 + vsum2345;  [local]
   57  const int32_t vacc = vbias + (int32_t) vsum;
   90  const uint32_t vsum = vsum016 + vsum2345;  [local]
   92  *b++ += (int32_t) vsum;
  150  const uint32_t vsum = vsum016 + vsum2345;  [local]
  151  vacc += (int32_t) vsum;
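In the quantized variant the seven-row byte sum is added onto a precomputed bias (line 57) and the final accumulator is requantized to uint8. The requantization tail is not in the hits above, so this single-channel sketch stands in a plain float multiplier for XNNPACK's fixed-point scheme, and the bias convention (pre-subtracting rows * input_zero_point) is an assumption:

    #include <stdint.h>
    #include <stdio.h>
    #include <math.h>

    /* Quantized average pooling for one channel, in the shape suggested by
     * the scalar qu8 hits: acc = bias + sum of input bytes, then requantize
     * (here via a float scale as a stand-in, then zero-point and clamp). */
    static uint8_t qu8_avgpool_1ch(const uint8_t* x, int rows,
                                   int32_t vbias, float vscale,
                                   int32_t output_zero_point) {
      uint32_t vsum = 0;
      for (int r = 0; r < rows; r++) vsum += x[r];
      const int32_t vacc = vbias + (int32_t)vsum;  /* as in line 57 */
      int32_t vout = (int32_t)lrintf((float)vacc * vscale) + output_zero_point;
      if (vout < 0) vout = 0;
      if (vout > 255) vout = 255;
      return (uint8_t)vout;
    }

    int main(void) {
      const uint8_t x[7] = {10, 20, 30, 40, 50, 60, 70};
      /* input zero point 0, scale 1/7: plain average, prints 40 */
      printf("%d\n", qu8_avgpool_1ch(x, 7, 0, 1.0f / 7.0f, 0));
      return 0;
    }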