/external/libvpx/vpx_dsp/x86/
D | variance_sse2.c
    26  __m128i vsum = _mm_setzero_si128();   in vpx_get_mb_ss_sse2() local
    31  vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));   in vpx_get_mb_ss_sse2()
    35  return add32x4_sse2(vsum);   in vpx_get_mb_ss_sse2()
    56  static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum,   in variance_final_128_pel_sse2() argument
    61  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));   in variance_final_128_pel_sse2()
    62  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));   in variance_final_128_pel_sse2()
    63  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));   in variance_final_128_pel_sse2()
    64  *sum = (int16_t)_mm_extract_epi16(vsum, 0);   in variance_final_128_pel_sse2()
    68  static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum,   in variance_final_256_pel_sse2() argument
    73  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));   in variance_final_256_pel_sse2()
    [all …]
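The variance_final_*_pel_sse2 helpers listed above reduce eight 16-bit partial sums to a scalar by repeatedly folding the upper half of the register onto the lower half. A minimal, self-contained sketch of that reduction pattern (the helper name is ours, not libvpx's):

    #include <emmintrin.h>  /* SSE2 */
    #include <stdint.h>

    /* Fold eight signed 16-bit lanes into one scalar sum, as in
     * variance_final_128_pel_sse2(): shift the high half down and add,
     * halving the number of live lanes at each step. The running sum must
     * fit in int16_t, which holds for 128 pixels of 8-bit differences. */
    static int16_t hsum_epi16(__m128i vsum) {
      vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
      vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
      vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
      return (int16_t)_mm_extract_epi16(vsum, 0);
    }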
|
D | variance_avx2.c
    62  __m128i vsum,   in variance_final_from_32bit_sum_avx2() argument
    70  const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum);   in variance_final_from_32bit_sum_avx2()
    71  const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum);   in variance_final_from_32bit_sum_avx2()
    81  __m256i vsum,   in variance_final_from_16bit_sum_avx2() argument
    85  const __m128i sum_reg_128 = _mm_add_epi16(_mm256_castsi256_si128(vsum),   in variance_final_from_16bit_sum_avx2()
    86  _mm256_extractf128_si256(vsum, 1));   in variance_final_from_16bit_sum_avx2()
    160  __m256i *const vsum) {   in variance8_avx2() argument
    162  *vsum = _mm256_setzero_si256();   in variance8_avx2()
    166  variance8_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum);   in variance8_avx2()
    175  __m256i *const vsum) {   in variance16_avx2() argument
    [all …]
|
/external/libaom/aom_dsp/x86/
D | variance_sse2.c
    25  __m128i vsum = _mm_setzero_si128();   in aom_get_mb_ss_sse2() local
    30  vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));   in aom_get_mb_ss_sse2()
    34  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));   in aom_get_mb_ss_sse2()
    35  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));   in aom_get_mb_ss_sse2()
    36  return (unsigned int)_mm_cvtsi128_si32(vsum);   in aom_get_mb_ss_sse2()
    82  static inline void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum,   in variance_final_128_pel_sse2() argument
    87  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));   in variance_final_128_pel_sse2()
    88  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));   in variance_final_128_pel_sse2()
    89  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));   in variance_final_128_pel_sse2()
    90  *sum = (int16_t)_mm_extract_epi16(vsum, 0);   in variance_final_128_pel_sse2()
    [all …]
|
D | variance_avx2.c
    49  static inline int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum,   in variance_final_from_32bit_sum_avx2() argument
    55  const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum);   in variance_final_from_32bit_sum_avx2()
    56  const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum);   in variance_final_from_32bit_sum_avx2()
    66  static inline int variance_final_512_avx2(__m256i vsse, __m256i vsum,   in variance_final_512_avx2() argument
    69  const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);   in variance_final_512_avx2()
    76  static inline int variance_final_1024_avx2(__m256i vsse, __m256i vsum,   in variance_final_1024_avx2() argument
    79  const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);   in variance_final_1024_avx2()
    94  static inline int variance_final_2048_avx2(__m256i vsse, __m256i vsum,   in variance_final_2048_avx2() argument
    96  vsum = sum_to_32bit_avx2(vsum);   in variance_final_2048_avx2()
    97  const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum);   in variance_final_2048_avx2()
    [all …]
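In both the libvpx and libaom AVX2 kernels, the 256-bit accumulator is first collapsed to 128 bits by adding its two halves (mm256_add_hi_lo_epi16/epi32 in libaom, the inline _mm_add_epi16 at lines 85–86 of the libvpx file). A sketch of that fold, assuming the translation unit is compiled with AVX2 enabled; the function names here are ours:

    #include <immintrin.h>  /* AVX/AVX2 */

    /* Add the upper and lower 128-bit halves of a 256-bit register of
     * 16-bit lanes; the eight surviving lanes can then be reduced with the
     * SSE2 fold shown earlier. */
    static __m128i add_hi_lo_epi16(__m256i vsum) {
      return _mm_add_epi16(_mm256_castsi256_si128(vsum),
                           _mm256_extractf128_si256(vsum, 1));
    }

    /* Same idea for 32-bit lanes, used once the sums have been widened
     * (cf. sum_to_32bit_avx2 and mm256_add_hi_lo_epi32 above). */
    static __m128i add_hi_lo_epi32(__m256i vsum) {
      return _mm_add_epi32(_mm256_castsi256_si128(vsum),
                           _mm256_extractf128_si256(vsum, 1));
    }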
|
/external/XNNPACK/src/f32-gavgpool-cw/
D | wasmsimd-x86-x4.c
    78  …const v128_t vsum = wasm_f32x4_add(wasm_v32x4_shuffle(vsum01, vsum23, 0, 2, 4, 6), wasm_v32x4_shuf…   in xnn_f32_gavgpool_cw_ukernel__wasmsimd_x86_x4() local
    79  v128_t vout = wasm_f32x4_mul(vsum, vmultiplier);   in xnn_f32_gavgpool_cw_ukernel__wasmsimd_x86_x4()
    94  v128_t vsum = wasm_f32x4_const_splat(0.0f);   in xnn_f32_gavgpool_cw_ukernel__wasmsimd_x86_x4() local
    99  vsum = wasm_f32x4_add(vsum, vi0);   in xnn_f32_gavgpool_cw_ukernel__wasmsimd_x86_x4()
    106  vsum = wasm_f32x4_add(vsum, vi0);   in xnn_f32_gavgpool_cw_ukernel__wasmsimd_x86_x4()
    109  …vsum = wasm_f32x4_add(wasm_v32x4_shuffle(vsum, vsum, 0, 2, 4, 6), wasm_v32x4_shuffle(vsum, vsum, 1…   in xnn_f32_gavgpool_cw_ukernel__wasmsimd_x86_x4()
    110  …vsum = wasm_f32x4_add(wasm_v32x4_shuffle(vsum, vsum, 0, 2, 4, 6), wasm_v32x4_shuffle(vsum, vsum, 1…   in xnn_f32_gavgpool_cw_ukernel__wasmsimd_x86_x4()
    112  v128_t vout = wasm_f32x4_mul(vsum, vmultiplier);   in xnn_f32_gavgpool_cw_ukernel__wasmsimd_x86_x4()
|
D | wasmsimd-arm-x4.c
    78  …const v128_t vsum = wasm_f32x4_add(wasm_v32x4_shuffle(vsum01, vsum23, 0, 2, 4, 6), wasm_v32x4_shuf…   in xnn_f32_gavgpool_cw_ukernel__wasmsimd_arm_x4() local
    79  v128_t vout = wasm_f32x4_mul(vsum, vmultiplier);   in xnn_f32_gavgpool_cw_ukernel__wasmsimd_arm_x4()
    94  v128_t vsum = wasm_f32x4_const_splat(0.0f);   in xnn_f32_gavgpool_cw_ukernel__wasmsimd_arm_x4() local
    99  vsum = wasm_f32x4_add(vsum, vi0);   in xnn_f32_gavgpool_cw_ukernel__wasmsimd_arm_x4()
    106  vsum = wasm_f32x4_add(vsum, vi0);   in xnn_f32_gavgpool_cw_ukernel__wasmsimd_arm_x4()
    109  …vsum = wasm_f32x4_add(wasm_v32x4_shuffle(vsum, vsum, 0, 2, 4, 6), wasm_v32x4_shuffle(vsum, vsum, 1…   in xnn_f32_gavgpool_cw_ukernel__wasmsimd_arm_x4()
    110  …vsum = wasm_f32x4_add(wasm_v32x4_shuffle(vsum, vsum, 0, 2, 4, 6), wasm_v32x4_shuffle(vsum, vsum, 1…   in xnn_f32_gavgpool_cw_ukernel__wasmsimd_arm_x4()
    112  v128_t vout = wasm_f32x4_mul(vsum, vmultiplier);   in xnn_f32_gavgpool_cw_ukernel__wasmsimd_arm_x4()
|
D | sse-x4.c
    78  const __m128 vsum = _mm_add_ps(_mm_movelh_ps(vsum01, vsum23), _mm_movehl_ps(vsum23, vsum01));   in xnn_f32_gavgpool_cw_ukernel__sse_x4() local
    79  __m128 vout = _mm_mul_ps(vsum, vmultiplier);   in xnn_f32_gavgpool_cw_ukernel__sse_x4()
    94  __m128 vsum = _mm_setzero_ps();   in xnn_f32_gavgpool_cw_ukernel__sse_x4() local
    99  vsum = _mm_add_ps(vsum, vi0);   in xnn_f32_gavgpool_cw_ukernel__sse_x4()
    106  vsum = _mm_add_ps(vsum, vi0);   in xnn_f32_gavgpool_cw_ukernel__sse_x4()
    109  vsum = _mm_add_ps(vsum, _mm_movehl_ps(vsum, vsum));   in xnn_f32_gavgpool_cw_ukernel__sse_x4()
    110  vsum = _mm_add_ss(vsum, _mm_shuffle_ps(vsum, vsum, _MM_SHUFFLE(3, 2, 1, 1)));   in xnn_f32_gavgpool_cw_ukernel__sse_x4()
    112  __m128 vout = _mm_mul_ss(vsum, vmultiplier);   in xnn_f32_gavgpool_cw_ukernel__sse_x4()
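Lines 109–112 of the SSE kernel are the classic four-float horizontal sum; a standalone sketch of that final step (function name is ours):

    #include <xmmintrin.h>  /* SSE */

    /* Horizontally add the four floats in v: fold the high pair onto the
     * low pair with MOVHLPS, then add lane 1 onto lane 0, mirroring the
     * tail reduction of xnn_f32_gavgpool_cw_ukernel__sse_x4(). */
    static float hsum_ps(__m128 v) {
      v = _mm_add_ps(v, _mm_movehl_ps(v, v));
      v = _mm_add_ss(v, _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 2, 1, 1)));
      return _mm_cvtss_f32(v);
    }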
|
D | neon-x4.c
    76  const float32x4_t vsum = vpaddq_f32(vsum01, vsum23);   in xnn_f32_gavgpool_cw_ukernel__neon_x4() local
    82  const float32x4_t vsum = vcombine_f32(vpadd_f32(vget_low_f32(vsum01), vget_high_f32(vsum01)),   in xnn_f32_gavgpool_cw_ukernel__neon_x4() local
    86  float32x4_t vout = vmulq_f32(vsum, vmultiplier);   in xnn_f32_gavgpool_cw_ukernel__neon_x4()
    114  float32x2_t vsum = vadd_f32(vget_low_f32(vsum0), vget_high_f32(vsum0));   in xnn_f32_gavgpool_cw_ukernel__neon_x4() local
    115  vsum = vpadd_f32(vsum, vsum);   in xnn_f32_gavgpool_cw_ukernel__neon_x4()
    117  float32x2_t vout = vmul_f32(vsum, vget_low_f32(vmultiplier));   in xnn_f32_gavgpool_cw_ukernel__neon_x4()
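The NEON kernel ends with the equivalent pairwise reduction; a minimal sketch (again with a helper name of our own):

    #include <arm_neon.h>

    /* Horizontally add four floats: fold the high D-register half onto the
     * low one, then do a pairwise add, as in the channel tail of
     * xnn_f32_gavgpool_cw_ukernel__neon_x4(). */
    static float hsum_f32(float32x4_t v) {
      float32x2_t vsum = vadd_f32(vget_low_f32(v), vget_high_f32(v));
      vsum = vpadd_f32(vsum, vsum);
      return vget_lane_f32(vsum, 0);
    }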
|
/external/cronet/tot/third_party/llvm-libc/src/AOR_v20.02/networking/arm/
D | chksum_simd.c
    22  uint64x1_t vsum = { 0 };   in __chksum_arm_simd() local
    45  vsum = vpaddl_u32(vtmp);   in __chksum_arm_simd()
    98  vsum = vpadal_u32(vsum, vtmp);   in __chksum_arm_simd()
    106  vsum = vpadal_u32(vsum, vtmp);   in __chksum_arm_simd()
    126  vsum = vpadal_u32(vsum, vreinterpret_u32_u64(vword64));   in __chksum_arm_simd()
    130  vsum = vpaddl_u32(vreinterpret_u32_u64(vsum));   in __chksum_arm_simd()
    131  vsum = vpaddl_u32(vreinterpret_u32_u64(vsum));   in __chksum_arm_simd()
    132  Assert(vget_lane_u32(vreinterpret_u32_u64(vsum), 1) == 0);   in __chksum_arm_simd()
    135  uint32x2_t vsum32 = vreinterpret_u32_u64(vsum);   in __chksum_arm_simd()
|
/external/cronet/stable/third_party/llvm-libc/src/AOR_v20.02/networking/arm/
D | chksum_simd.c
    22  uint64x1_t vsum = { 0 };   in __chksum_arm_simd() local
    45  vsum = vpaddl_u32(vtmp);   in __chksum_arm_simd()
    98  vsum = vpadal_u32(vsum, vtmp);   in __chksum_arm_simd()
    106  vsum = vpadal_u32(vsum, vtmp);   in __chksum_arm_simd()
    126  vsum = vpadal_u32(vsum, vreinterpret_u32_u64(vword64));   in __chksum_arm_simd()
    130  vsum = vpaddl_u32(vreinterpret_u32_u64(vsum));   in __chksum_arm_simd()
    131  vsum = vpaddl_u32(vreinterpret_u32_u64(vsum));   in __chksum_arm_simd()
    132  Assert(vget_lane_u32(vreinterpret_u32_u64(vsum), 1) == 0);   in __chksum_arm_simd()
    135  uint32x2_t vsum32 = vreinterpret_u32_u64(vsum);   in __chksum_arm_simd()
|
/external/arm-optimized-routines/networking/arm/
D | chksum_simd.c
    21  uint64x1_t vsum = { 0 };   in __chksum_arm_simd() local
    44  vsum = vpaddl_u32(vtmp);   in __chksum_arm_simd()
    97  vsum = vpadal_u32(vsum, vtmp);   in __chksum_arm_simd()
    105  vsum = vpadal_u32(vsum, vtmp);   in __chksum_arm_simd()
    125  vsum = vpadal_u32(vsum, vreinterpret_u32_u64(vword64));   in __chksum_arm_simd()
    129  vsum = vpaddl_u32(vreinterpret_u32_u64(vsum));   in __chksum_arm_simd()
    130  vsum = vpaddl_u32(vreinterpret_u32_u64(vsum));   in __chksum_arm_simd()
    131  Assert(vget_lane_u32(vreinterpret_u32_u64(vsum), 1) == 0);   in __chksum_arm_simd()
    134  uint32x2_t vsum32 = vreinterpret_u32_u64(vsum);   in __chksum_arm_simd()
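All three copies of chksum_simd.c keep the running checksum in a single 64-bit lane: 32-bit partial sums are pairwise-accumulated into it with vpadal_u32, and the total is later folded with vpaddl_u32 until the upper 32 bits are provably zero (the Assert at the end). A sketch of those two steps, with a helper name and signature of our own choosing:

    #include <arm_neon.h>
    #include <stdint.h>

    /* Accumulate one pair of 32-bit partial sums into the 64-bit running
     * total, then fold the total so its upper half is zero, mirroring the
     * vpadal_u32 / vpaddl_u32 sequence in __chksum_arm_simd(). */
    static uint32_t accumulate_and_fold(uint64x1_t vsum, uint32x2_t vtmp) {
      vsum = vpadal_u32(vsum, vtmp);                  /* vsum += vtmp[0] + vtmp[1] */
      vsum = vpaddl_u32(vreinterpret_u32_u64(vsum));  /* fold: low32 + high32 */
      vsum = vpaddl_u32(vreinterpret_u32_u64(vsum));  /* second fold clears the carry bit */
      return vget_lane_u32(vreinterpret_u32_u64(vsum), 0);
    }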
|
/external/libaom/av1/encoder/x86/
D | highbd_temporal_filter_avx2.c
    125  static AOM_FORCE_INLINE int32_t xx_mask_and_hadd(__m256i vsum, int i) {   in xx_mask_and_hadd() argument
    127  __m256i vtmp = _mm256_and_si256(vsum, *(__m256i *)sse_bytemask[i]);   in xx_mask_and_hadd()
    185  __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);   in highbd_apply_temporal_filter() local
    194  acc_5x5_sse[row][0] = xx_mask_and_hadd(vsum, 0);   in highbd_apply_temporal_filter()
    195  acc_5x5_sse[row][1] = xx_mask_and_hadd(vsum, 1);   in highbd_apply_temporal_filter()
    196  acc_5x5_sse[row][2] = xx_mask_and_hadd(vsum, 2);   in highbd_apply_temporal_filter()
    197  acc_5x5_sse[row][3] = xx_mask_and_hadd(vsum, 3);   in highbd_apply_temporal_filter()
    203  __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);   in highbd_apply_temporal_filter() local
    209  acc_5x5_sse[row][0] = xx_mask_and_hadd(vsum, 0);   in highbd_apply_temporal_filter()
    210  acc_5x5_sse[row][1] = xx_mask_and_hadd(vsum, 1);   in highbd_apply_temporal_filter()
    [all …]
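xx_mask_and_hadd() zeroes the 32-bit lanes outside the current 5-wide window and sums the survivors. A sketch of that reduction with the mask passed in explicitly; the real kernel instead indexes the precomputed sse_bytemask table by i:

    #include <immintrin.h>
    #include <stdint.h>

    /* AND away the unwanted lanes, then horizontally add the remaining
     * 32-bit values: collapse 256 -> 128 bits, then two in-register folds. */
    static int32_t mask_and_hadd(__m256i vsum, __m256i vmask) {
      const __m256i vtmp = _mm256_and_si256(vsum, vmask);
      __m128i v = _mm_add_epi32(_mm256_castsi256_si128(vtmp),
                                _mm256_extractf128_si256(vtmp, 1));
      v = _mm_add_epi32(v, _mm_srli_si128(v, 8));
      v = _mm_add_epi32(v, _mm_srli_si128(v, 4));
      return _mm_cvtsi128_si32(v);
    }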
|
/external/pytorch/aten/src/ATen/native/quantized/cpu/qnnpack/src/u8lut32norm/
D | scalar.c
    21  uint32_t vsum = 0;   in compute_sum() local
    24  vsum += t[vx];   in compute_sum()
    26  return vsum;   in compute_sum()
    36  const uint32_t vsum = compute_sum(n, x, t);   in pytorch_u8lut32norm_ukernel__scalar() local
    37  assert(vsum != 0);   in pytorch_u8lut32norm_ukernel__scalar()
    39  struct fxdiv_divisor_uint32_t vsum_divisor = fxdiv_init_uint32_t(vsum);   in pytorch_u8lut32norm_ukernel__scalar()
    40  const uint32_t vrounding = (vsum >> 1);   in pytorch_u8lut32norm_ukernel__scalar()
|
/external/XNNPACK/src/u8-lut32norm/
D | scalar.c
    23  uint32_t vsum = 0;   in compute_sum() local
    26  vsum += t[vx];   in compute_sum()
    28  return vsum;   in compute_sum()
    39  const uint32_t vsum = compute_sum(n, x, t);   in xnn_u8_lut32norm_ukernel__scalar() local
    40  assert(vsum != 0);   in xnn_u8_lut32norm_ukernel__scalar()
    42  struct fxdiv_divisor_uint32_t vsum_divisor = fxdiv_init_uint32_t(vsum);   in xnn_u8_lut32norm_ukernel__scalar()
    43  const uint32_t vrounding = (vsum >> 1);   in xnn_u8_lut32norm_ukernel__scalar()
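Both lut32norm kernels (the QNNPACK and XNNPACK copies above) first total the 32-bit table entries selected by the input bytes; that total then becomes the divisor for the normalization step, which is why the caller asserts it is nonzero. A self-contained sketch of compute_sum() as the listings show it (the parameter types are our assumption):

    #include <stddef.h>
    #include <stdint.h>

    /* Sum the table entries picked out by each input byte. */
    static uint32_t compute_sum(size_t n, const uint8_t* x, const uint32_t* t) {
      uint32_t vsum = 0;
      while (n-- != 0) {
        const uint8_t vx = *x++;
        vsum += t[vx];
      }
      return vsum;
    }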
|
/external/XNNPACK/src/qs8-gavgpool/
D | multipass-neon.c.in
    80  ${XINT16X8_T} vsum${ABC[C:C+8]} = ${VADDL_X8}(vi0x${ABC[C:C+8]}, vi1x${ABC[C:C+8]});
    86  vsum${ABC[C:C+8]} = ${VADDW_X8}(vsum${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]});
    90  const int32x4_t vacc${ABC[C:C+4]} = vaddw_s16(vinit_bias, vget_low_s16(vsum${ABC[C:C+8]}));
    91  … const int32x4_t vacc${ABC[C+4:C+8]} = vaddw_s16(vinit_bias, vget_high_s16(vsum${ABC[C:C+8]}));
    93  …einterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum${ABC[C:C+8]})));
    94  …interpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum${ABC[C:C+8]})));
    104  ${XINT16X8_T} vsum${ABC[0:8]} = ${VADDL_X8}(vi0x${ABC[0:8]}, vi1x${ABC[0:8]});
    109  vsum${ABC[0:8]} = ${VADDW_X8}(vsum${ABC[0:8]}, vi${M}x${ABC[0:8]});
    112  const int32x4_t vacc${ABC[0:4]} = vaddw_s16(vinit_bias, vget_low_s16(vsum${ABC[0:8]}));
    113  const int32x4_t vacc${ABC[4:8]} = vaddw_s16(vinit_bias, vget_high_s16(vsum${ABC[0:8]}));
    [all …]
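The template emits widening adds so that several 8-bit rows can be summed in 16 bits before being spread onto 32-bit accumulators: ${VADDL_X8} starts the 16-bit sum, ${VADDW_X8} folds in each further row, and vaddw_s16 adds the halves onto the bias. A hand-expanded instance for the signed case, with the ${...} placeholders resolved and a row count of three chosen only for illustration:

    #include <arm_neon.h>
    #include <stdint.h>

    /* Sum three rows of eight int8 values: vaddl_s8 starts a 16-bit sum,
     * vaddw_s8 widens-and-adds each further row, and vaddw_s16 spreads the
     * eight 16-bit sums onto two 32-bit accumulators seeded with the bias. */
    static void sum_3_rows_s8(const int8_t* i0, const int8_t* i1, const int8_t* i2,
                              int32x4_t vinit_bias,
                              int32x4_t* vacc_lo, int32x4_t* vacc_hi) {
      int16x8_t vsum = vaddl_s8(vld1_s8(i0), vld1_s8(i1));
      vsum = vaddw_s8(vsum, vld1_s8(i2));
      *vacc_lo = vaddw_s16(vinit_bias, vget_low_s16(vsum));
      *vacc_hi = vaddw_s16(vinit_bias, vget_high_s16(vsum));
    }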
|
/external/XNNPACK/src/f16-gavgpool-cw/
D | neonfp16arith-x8.c
    79  const float16x4_t vsum = vpadd_f16(vget_low_f16(vsum0123), vget_high_f16(vsum0123));   in xnn_f16_gavgpool_cw_ukernel__neonfp16arith_x8() local
    87  const float16x4_t vsum = vpadd_f16(vsum01_lo, vsum23_lo);   in xnn_f16_gavgpool_cw_ukernel__neonfp16arith_x8() local
    90  float16x4_t vout = vmul_f16(vsum, vmultiplier);   in xnn_f16_gavgpool_cw_ukernel__neonfp16arith_x8()
    122  float16x4_t vsum = vadd_f16(vget_low_f16(vsum0), vget_high_f16(vsum0));   in xnn_f16_gavgpool_cw_ukernel__neonfp16arith_x8() local
    123  vsum = vpadd_f16(vsum, vsum);   in xnn_f16_gavgpool_cw_ukernel__neonfp16arith_x8()
    124  vsum = vpadd_f16(vsum, vsum);   in xnn_f16_gavgpool_cw_ukernel__neonfp16arith_x8()
    126  float16x4_t vout = vmul_f16(vsum, vmultiplier);   in xnn_f16_gavgpool_cw_ukernel__neonfp16arith_x8()
|
/external/libmpeg2/common/x86/
D | icv_variance_ssse3.c
    94  __m128i vsum, vsum_sqr;   in icv_variance_8x4_ssse3() local
    128  vsum = _mm_add_epi64(sum_r0, sum_r1);   in icv_variance_8x4_ssse3()
    129  vsum = _mm_add_epi64(vsum, _mm_srli_si128(vsum, 8));   in icv_variance_8x4_ssse3()
    131  sum = _mm_cvtsi128_si32(vsum);   in icv_variance_8x4_ssse3()
|
/external/pytorch/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8avgpool/
D | mp8x9p8q-neon.c
    88  const uint16x8_t vsum = vaddq_u16(vsum2345, vsum01678);   in pytorch_q8avgpool_ukernel_mp8x9p8q__neon() local
    91  vaddw_s16(vbias, vreinterpret_s16_u16(vget_low_u16(vsum)));   in pytorch_q8avgpool_ukernel_mp8x9p8q__neon()
    93  vaddw_s16(vbias, vreinterpret_s16_u16(vget_high_u16(vsum)));   in pytorch_q8avgpool_ukernel_mp8x9p8q__neon()
    141  const uint16x8_t vsum = vaddq_u16(vsum2345, vsum01678);   in pytorch_q8avgpool_ukernel_mp8x9p8q__neon() local
    144  vaddw_s16(vbias, vreinterpret_s16_u16(vget_low_u16(vsum)));   in pytorch_q8avgpool_ukernel_mp8x9p8q__neon()
    146  vaddw_s16(vbias, vreinterpret_s16_u16(vget_high_u16(vsum)));   in pytorch_q8avgpool_ukernel_mp8x9p8q__neon()
    194  const uint16x8_t vsum = vaddq_u16(vsum0123, vsum4567);   in pytorch_q8avgpool_ukernel_mp8x9p8q__neon() local
    196  vacc_lo = vaddw_s16(vacc_lo, vreinterpret_s16_u16(vget_low_u16(vsum)));   in pytorch_q8avgpool_ukernel_mp8x9p8q__neon()
    197  vacc_hi = vaddw_s16(vacc_hi, vreinterpret_s16_u16(vget_high_u16(vsum)));   in pytorch_q8avgpool_ukernel_mp8x9p8q__neon()
    244  const uint16x8_t vsum = vaddq_u16(vsum0123, vsum4567);   in pytorch_q8avgpool_ukernel_mp8x9p8q__neon() local
    [all …]
|
D | mp8x9p8q-sse2.c
    87  const __m128i vsum = _mm_add_epi16(vsum2345, vsum01678);   in pytorch_q8avgpool_ukernel_mp8x9p8q__sse2() local
    90  _mm_add_epi32(vbias, _mm_unpacklo_epi16(vsum, vzero));   in pytorch_q8avgpool_ukernel_mp8x9p8q__sse2()
    92  _mm_add_epi32(vbias, _mm_unpackhi_epi16(vsum, vzero));   in pytorch_q8avgpool_ukernel_mp8x9p8q__sse2()
    149  const __m128i vsum = _mm_add_epi16(vsum2345, vsum01678);   in pytorch_q8avgpool_ukernel_mp8x9p8q__sse2() local
    152  _mm_add_epi32(vbias, _mm_unpacklo_epi16(vsum, vzero));   in pytorch_q8avgpool_ukernel_mp8x9p8q__sse2()
    154  _mm_add_epi32(vbias, _mm_unpackhi_epi16(vsum, vzero));   in pytorch_q8avgpool_ukernel_mp8x9p8q__sse2()
    210  const __m128i vsum = _mm_add_epi16(vsum0123, vsum4567);   in pytorch_q8avgpool_ukernel_mp8x9p8q__sse2() local
    212  vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));   in pytorch_q8avgpool_ukernel_mp8x9p8q__sse2()
    213  vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));   in pytorch_q8avgpool_ukernel_mp8x9p8q__sse2()
    268  const __m128i vsum = _mm_add_epi16(vsum0123, vsum4567);   in pytorch_q8avgpool_ukernel_mp8x9p8q__sse2() local
    [all …]
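Both q8avgpool kernels widen the eight 16-bit row sums to 32 bits before adding the bias; in the SSE2 version that is an unpack against zero (the sums are unsigned, so zero-extension is correct). A sketch of that step with a helper name of our own:

    #include <emmintrin.h>

    /* Zero-extend eight 16-bit sums to 32 bits and add the bias, producing
     * the low and high accumulator halves used by the mp8x9p8q SSE2 kernel. */
    static void widen_and_bias(__m128i vsum, __m128i vbias,
                               __m128i* vacc_lo, __m128i* vacc_hi) {
      const __m128i vzero = _mm_setzero_si128();
      *vacc_lo = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vsum, vzero));
      *vacc_hi = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vsum, vzero));
    }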
|
/external/XNNPACK/src/f32-gavgpool/
D | 7p7x-minmax-scalar-c1.c
    52  const float vsum = vsum016 + vsum2345;   in xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1() local
    54  *b++ = vsum;   in xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1()
    86  const float vsum = vsum0123 + vsum456a;   in xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1() local
    88  *b++ = vsum;   in xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1()
    140  const float vsum = vsum0123 + vsum456a;   in xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1() local
    142  float vout = vsum * vscale;   in xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1()
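The scalar 7p7x kernels add the seven input rows of each channel as a short balanced tree before scaling. The grouping below is our reconstruction from the vsum016 and vsum2345 names visible at line 52; only those two names are taken from the listing:

    /* Sum seven row values: pairs first, then the leftover row, keeping the
     * floating-point dependency chain short. */
    static float sum7(float i0, float i1, float i2, float i3,
                      float i4, float i5, float i6) {
      const float vsum01   = i0 + i1;
      const float vsum23   = i2 + i3;
      const float vsum45   = i4 + i5;
      const float vsum016  = vsum01 + i6;
      const float vsum2345 = vsum23 + vsum45;
      return vsum016 + vsum2345;  /* the vsum stored to the buffer at line 54 */
    }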
|
D | 7p7x-minmax-wasm-c1.c
    52  const float vsum = vsum016 + vsum2345;   in xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1() local
    54  *b++ = vsum;   in xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1()
    86  const float vsum = vsum0123 + vsum456a;   in xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1() local
    88  *b++ = vsum;   in xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1()
    140  const float vsum = vsum0123 + vsum456a;   in xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1() local
    142  float vout = vsum * vscale;   in xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1()
|
D | 7p7x-minmax-wasmsimd-arm-c4.c
    61  const v128_t vsum = wasm_f32x4_add(vsum016, vsum2345);   in xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4() local
    63  wasm_v128_store(b, vsum);   in xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4()
    102  const v128_t vsum = wasm_f32x4_add(vsum0123, vsum456a);   in xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4() local
    104  wasm_v128_store(b, vsum); b += 4;   in xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4()
    164  const v128_t vsum = wasm_f32x4_add(vsum0123, vsum456a);   in xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4() local
    166  v128_t vout = wasm_f32x4_mul(vsum, vscale);   in xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4()
    193  const v128_t vsum = wasm_f32x4_add(vsum0123, vsum456a);   in xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4() local
    195  v128_t vout = wasm_f32x4_mul(vsum, vscale);   in xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4()
|
D | 7p7x-minmax-wasmsimd-x86-c4.c
    61  const v128_t vsum = wasm_f32x4_add(vsum016, vsum2345);   in xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4() local
    63  wasm_v128_store(b, vsum);   in xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4()
    102  const v128_t vsum = wasm_f32x4_add(vsum0123, vsum456a);   in xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4() local
    104  wasm_v128_store(b, vsum); b += 4;   in xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4()
    164  const v128_t vsum = wasm_f32x4_add(vsum0123, vsum456a);   in xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4() local
    166  v128_t vout = wasm_f32x4_mul(vsum, vscale);   in xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4()
    193  const v128_t vsum = wasm_f32x4_add(vsum0123, vsum456a);   in xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4() local
    195  v128_t vout = wasm_f32x4_mul(vsum, vscale);   in xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4()
|
D | 7p7x-minmax-sse-c4.c
    61  const __m128 vsum = _mm_add_ps(vsum016, vsum2345);   in xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4() local
    63  _mm_store_ps(b, vsum); b += 4;   in xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4()
    101  const __m128 vsum = _mm_add_ps(vsum0123, vsum456a);   in xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4() local
    103  _mm_store_ps(b, vsum); b += 4;   in xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4()
    163  const __m128 vsum = _mm_add_ps(vsum0123, vsum456a);   in xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4() local
    165  __m128 vout = _mm_mul_ps(vsum, vscale);   in xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4()
    192  const __m128 vsum = _mm_add_ps(vsum0123, vsum456a);   in xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4() local
    194  __m128 vout = _mm_mul_ps(vsum, vscale);   in xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4()
|
D | 7p7x-minmax-neon-c4.c
    54  const float32x4_t vsum = vaddq_f32(vsum016, vsum2345);   in xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4() local
    56  vst1q_f32(b, vsum); b += 4;   in xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4()
    87  const float32x4_t vsum = vaddq_f32(vsum0123, vsum456a);   in xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4() local
    89  vst1q_f32(b, vsum); b += 4;   in xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4()
    141  const float32x4_t vsum = vaddq_f32(vsum0123, vsum456a);   in xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4() local
    143  float32x4_t vout = vmulq_f32(vsum, vscale);   in xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4()
    169  const float32x4_t vsum = vaddq_f32(vsum0123, vsum456a);   in xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4() local
    171  float32x4_t vout = vmulq_f32(vsum, vscale);   in xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4()
|