/external/XNNPACK/src/qu8-requantization/

precise-neon.c (in xnn_qu8_requantize_precise__neon()):
  40: const int32x4_t vmultiplier = vdupq_n_s32(multiplier);   // local definition
  42: const int32x2_t vmultiplier = vdup_n_s32(multiplier);    // local definition
  61: const int64x2_t x01_product = vmull_s32(vget_low_s32(x), vget_low_s32(vmultiplier));
  62: const int64x2_t x23_product = vmull_high_s32(x, vmultiplier);
  63: const int64x2_t y01_product = vmull_s32(vget_low_s32(y), vget_low_s32(vmultiplier));
  64: const int64x2_t y23_product = vmull_high_s32(y, vmultiplier);
  65: const int64x2_t z01_product = vmull_s32(vget_low_s32(z), vget_low_s32(vmultiplier));
  66: const int64x2_t z23_product = vmull_high_s32(z, vmultiplier);
  67: const int64x2_t w01_product = vmull_s32(vget_low_s32(w), vget_low_s32(vmultiplier));
  68: const int64x2_t w23_product = vmull_high_s32(w, vmultiplier);
  [all …]

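These NEON kernels widen each 32x32-bit product to 64 bits half a vector at a time: vmull_s32 covers lanes 0 and 1, and vmull_high_s32 covers lanes 2 and 3. A minimal self-contained sketch of the pattern from lines 61-62 above; the helper name is illustrative, and the rounding right shift that follows in the real kernel is omitted:

    #include <arm_neon.h>

    /* Widen four int32 lanes of x into two int64x2_t products with the
     * broadcast multiplier. vmull_high_s32 requires AArch64. */
    static inline void widen_mul_s32(int32x4_t x, int32x4_t vmultiplier,
                                     int64x2_t* product01, int64x2_t* product23) {
      *product01 = vmull_s32(vget_low_s32(x), vget_low_s32(vmultiplier));  /* lanes 0, 1 */
      *product23 = vmull_high_s32(x, vmultiplier);                         /* lanes 2, 3 */
    }
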
precise-sse4.c (in xnn_qu8_requantize_precise__sse4()):
  40: const __m128i vmultiplier = _mm_set1_epi32(multiplier);   // local definition
  64: const __m128i x_absmul02 = _mm_mul_epu32(x_abs0123, vmultiplier);
  65: const __m128i y_absmul02 = _mm_mul_epu32(y_abs0123, vmultiplier);
  66: const __m128i z_absmul02 = _mm_mul_epu32(z_abs0123, vmultiplier);
  67: const __m128i w_absmul02 = _mm_mul_epu32(w_abs0123, vmultiplier);
  69: const __m128i x_absmul13 = _mm_mul_epu32(x_abs1032, vmultiplier);
  70: const __m128i y_absmul13 = _mm_mul_epu32(y_abs1032, vmultiplier);
  71: const __m128i z_absmul13 = _mm_mul_epu32(z_abs1032, vmultiplier);
  72: const __m128i w_absmul13 = _mm_mul_epu32(w_abs1032, vmultiplier);

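_mm_mul_epu32 multiplies only the even (0 and 2) 32-bit lanes, which is why every vector here is multiplied twice: once as loaded (x_abs0123) and once with adjacent lanes swapped (x_abs1032) to reach lanes 1 and 3. A self-contained sketch of the trick; the helper name is illustrative:

    #include <emmintrin.h>  /* SSE2 */

    /* All four 32x32 -> 64-bit unsigned products of vabs0123 and vmultiplier.
     * _mm_mul_epu32 reads lanes 0 and 2 only, so a lane-swapped copy
     * (0123 -> 1032) supplies the products for lanes 1 and 3. */
    static inline void mul_all_lanes_epu32(__m128i vabs0123, __m128i vmultiplier,
                                           __m128i* absmul02, __m128i* absmul13) {
      const __m128i vabs1032 = _mm_shuffle_epi32(vabs0123, _MM_SHUFFLE(2, 3, 0, 1));
      *absmul02 = _mm_mul_epu32(vabs0123, vmultiplier);  /* lanes 0 and 2 */
      *absmul13 = _mm_mul_epu32(vabs1032, vmultiplier);  /* lanes 1 and 3 */
    }

The precise kernels run this on absolute values so one unsigned multiply covers both signs; the sign is reapplied after the shift.
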
precise-ssse3.c (in xnn_qu8_requantize_precise__ssse3()):
  40: const __m128i vmultiplier = _mm_set1_epi32(multiplier);   // local definition
  63: const __m128i x_absmul02 = _mm_mul_epu32(x_abs0123, vmultiplier);
  64: const __m128i y_absmul02 = _mm_mul_epu32(y_abs0123, vmultiplier);
  65: const __m128i z_absmul02 = _mm_mul_epu32(z_abs0123, vmultiplier);
  66: const __m128i w_absmul02 = _mm_mul_epu32(w_abs0123, vmultiplier);
  68: const __m128i x_absmul13 = _mm_mul_epu32(x_abs1032, vmultiplier);
  69: const __m128i y_absmul13 = _mm_mul_epu32(y_abs1032, vmultiplier);
  70: const __m128i z_absmul13 = _mm_mul_epu32(z_abs1032, vmultiplier);
  71: const __m128i w_absmul13 = _mm_mul_epu32(w_abs1032, vmultiplier);

q31-sse4.c (in xnn_qu8_requantize_q31__sse4()):
  46: const __m128i vmultiplier = _mm_set1_epi32(multiplier);   // local definition
  67: const __m128i x_product_even = _mm_add_epi64(_mm_mul_epi32(x, vmultiplier), vq31rounding);
  68: const __m128i y_product_even = _mm_add_epi64(_mm_mul_epi32(y, vmultiplier), vq31rounding);
  69: const __m128i z_product_even = _mm_add_epi64(_mm_mul_epi32(z, vmultiplier), vq31rounding);
  70: const __m128i w_product_even = _mm_add_epi64(_mm_mul_epi32(w, vmultiplier), vq31rounding);
  72: const __m128i x_product_odd = _mm_add_epi64(_mm_mul_epi32(x_rev, vmultiplier), vq31rounding);
  73: const __m128i y_product_odd = _mm_add_epi64(_mm_mul_epi32(y_rev, vmultiplier), vq31rounding);
  74: const __m128i z_product_odd = _mm_add_epi64(_mm_mul_epi32(z_rev, vmultiplier), vq31rounding);
  75: const __m128i w_product_odd = _mm_add_epi64(_mm_mul_epi32(w_rev, vmultiplier), vq31rounding);

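The q31 variants keep the sign by using SSE4.1's signed _mm_mul_epi32 and fold the Q31 rounding term into the product before shifting down by 31. A sketch of the even-lane half, assuming vq31rounding holds 2^30 in each 64-bit lane (the name suggests as much, but the constant is my assumption); the odd lanes go through the same sequence on a lane-swapped copy (x_rev above):

    #include <smmintrin.h>  /* SSE4.1 */
    #include <stdint.h>

    /* Even-lane Q31 multiply: ((int64_t) x[i] * multiplier + (1 << 30)) >> 31
     * for lanes 0 and 2. Only the low 32 bits of each 64-bit lane are kept
     * afterwards, so a logical shift is safe even for negative products. */
    static inline __m128i q31_mul_even(__m128i x, __m128i vmultiplier) {
      const __m128i vq31rounding = _mm_set1_epi64x(INT64_C(1) << 30);
      const __m128i product = _mm_add_epi64(_mm_mul_epi32(x, vmultiplier), vq31rounding);
      return _mm_srli_epi64(product, 31);
    }
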
precise-sse2.c (in xnn_qu8_requantize_precise__sse2()):
  40: const __m128i vmultiplier = _mm_set1_epi32(multiplier);   // local definition
  68: const __m128i x_absmul02 = _mm_mul_epu32(x_abs0123, vmultiplier);
  69: const __m128i y_absmul02 = _mm_mul_epu32(y_abs0123, vmultiplier);
  70: const __m128i z_absmul02 = _mm_mul_epu32(z_abs0123, vmultiplier);
  71: const __m128i w_absmul02 = _mm_mul_epu32(w_abs0123, vmultiplier);
  73: const __m128i x_absmul13 = _mm_mul_epu32(x_abs1032, vmultiplier);
  74: const __m128i y_absmul13 = _mm_mul_epu32(y_abs1032, vmultiplier);
  75: const __m128i z_absmul13 = _mm_mul_epu32(z_abs1032, vmultiplier);
  76: const __m128i w_absmul13 = _mm_mul_epu32(w_abs1032, vmultiplier);

q31-wasmsimd.c (in xnn_qu8_requantize_q31__wasmsimd()):
  48: const v128_t vmultiplier = wasm_i64x2_make(twice_multiplier, twice_multiplier);   // local definition
  79: const v128_t x_product_lo = wasm_i64x2_add(wasm_i64x2_mul(x_lo, vmultiplier), vtwice_q31rounding);
  80: const v128_t y_product_lo = wasm_i64x2_add(wasm_i64x2_mul(y_lo, vmultiplier), vtwice_q31rounding);
  81: const v128_t z_product_lo = wasm_i64x2_add(wasm_i64x2_mul(z_lo, vmultiplier), vtwice_q31rounding);
  82: const v128_t w_product_lo = wasm_i64x2_add(wasm_i64x2_mul(w_lo, vmultiplier), vtwice_q31rounding);
  84: const v128_t x_product_hi = wasm_i64x2_add(wasm_i64x2_mul(x_hi, vmultiplier), vtwice_q31rounding);
  85: const v128_t y_product_hi = wasm_i64x2_add(wasm_i64x2_mul(y_hi, vmultiplier), vtwice_q31rounding);
  86: const v128_t z_product_hi = wasm_i64x2_add(wasm_i64x2_mul(z_hi, vmultiplier), vtwice_q31rounding);
  87: const v128_t w_product_hi = wasm_i64x2_add(wasm_i64x2_mul(w_hi, vmultiplier), vtwice_q31rounding);

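WAsm SIMD has a full 64-bit wasm_i64x2_mul, and the twice_multiplier name points at a rescaling trick: ((2m)·x + 2^31) >> 32 equals the Q31 form (m·x + 2^30) >> 31, and a shift by exactly 32 amounts to taking the high half of each 64-bit lane. A sketch under that reading; the helper, and using wasm_i64x2_shr where the real kernel reassembles high halves with shuffles, are my simplifications:

    #include <wasm_simd128.h>
    #include <stdint.h>

    /* Q31 multiply of sign-extended 32-bit values held in the two i64 lanes
     * of x_wide. Doubling the multiplier and rounding constant turns the
     * >> 31 into a >> 32. */
    static inline v128_t q31_mul_wasm(v128_t x_wide, int32_t multiplier) {
      const int64_t twice_multiplier = 2 * (int64_t) multiplier;
      const v128_t vmultiplier = wasm_i64x2_make(twice_multiplier, twice_multiplier);
      const v128_t vtwice_q31rounding = wasm_i64x2_const(INT64_C(1) << 31, INT64_C(1) << 31);
      const v128_t product = wasm_i64x2_add(wasm_i64x2_mul(x_wide, vmultiplier), vtwice_q31rounding);
      return wasm_i64x2_shr(product, 32);  /* arithmetic shift per 64-bit lane */
    }
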
/external/XNNPACK/src/qs8-requantization/

precise-neon.c (in xnn_qs8_requantize_precise__neon()):
  40: const int32x4_t vmultiplier = vdupq_n_s32(multiplier);   // local definition
  42: const int32x2_t vmultiplier = vdup_n_s32(multiplier);    // local definition
  61: const int64x2_t x01_product = vmull_s32(vget_low_s32(x), vget_low_s32(vmultiplier));
  62: const int64x2_t x23_product = vmull_high_s32(x, vmultiplier);
  63: const int64x2_t y01_product = vmull_s32(vget_low_s32(y), vget_low_s32(vmultiplier));
  64: const int64x2_t y23_product = vmull_high_s32(y, vmultiplier);
  65: const int64x2_t z01_product = vmull_s32(vget_low_s32(z), vget_low_s32(vmultiplier));
  66: const int64x2_t z23_product = vmull_high_s32(z, vmultiplier);
  67: const int64x2_t w01_product = vmull_s32(vget_low_s32(w), vget_low_s32(vmultiplier));
  68: const int64x2_t w23_product = vmull_high_s32(w, vmultiplier);
  [all …]

precise-sse4.c (in xnn_qs8_requantize_precise__sse4()):
  40: const __m128i vmultiplier = _mm_set1_epi32(multiplier);   // local definition
  64: const __m128i x_absmul02 = _mm_mul_epu32(x_abs0123, vmultiplier);
  65: const __m128i y_absmul02 = _mm_mul_epu32(y_abs0123, vmultiplier);
  66: const __m128i z_absmul02 = _mm_mul_epu32(z_abs0123, vmultiplier);
  67: const __m128i w_absmul02 = _mm_mul_epu32(w_abs0123, vmultiplier);
  69: const __m128i x_absmul13 = _mm_mul_epu32(x_abs1032, vmultiplier);
  70: const __m128i y_absmul13 = _mm_mul_epu32(y_abs1032, vmultiplier);
  71: const __m128i z_absmul13 = _mm_mul_epu32(z_abs1032, vmultiplier);
  72: const __m128i w_absmul13 = _mm_mul_epu32(w_abs1032, vmultiplier);

q31-sse4.c (in xnn_qs8_requantize_q31__sse4()):
  46: const __m128i vmultiplier = _mm_set1_epi32(multiplier);   // local definition
  67: const __m128i x_product_even = _mm_add_epi64(_mm_mul_epi32(x, vmultiplier), vq31rounding);
  68: const __m128i y_product_even = _mm_add_epi64(_mm_mul_epi32(y, vmultiplier), vq31rounding);
  69: const __m128i z_product_even = _mm_add_epi64(_mm_mul_epi32(z, vmultiplier), vq31rounding);
  70: const __m128i w_product_even = _mm_add_epi64(_mm_mul_epi32(w, vmultiplier), vq31rounding);
  72: const __m128i x_product_odd = _mm_add_epi64(_mm_mul_epi32(x_rev, vmultiplier), vq31rounding);
  73: const __m128i y_product_odd = _mm_add_epi64(_mm_mul_epi32(y_rev, vmultiplier), vq31rounding);
  74: const __m128i z_product_odd = _mm_add_epi64(_mm_mul_epi32(z_rev, vmultiplier), vq31rounding);
  75: const __m128i w_product_odd = _mm_add_epi64(_mm_mul_epi32(w_rev, vmultiplier), vq31rounding);

precise-ssse3.c (in xnn_qs8_requantize_precise__ssse3()):
  40: const __m128i vmultiplier = _mm_set1_epi32(multiplier);   // local definition
  63: const __m128i x_absmul02 = _mm_mul_epu32(x_abs0123, vmultiplier);
  64: const __m128i y_absmul02 = _mm_mul_epu32(y_abs0123, vmultiplier);
  65: const __m128i z_absmul02 = _mm_mul_epu32(z_abs0123, vmultiplier);
  66: const __m128i w_absmul02 = _mm_mul_epu32(w_abs0123, vmultiplier);
  68: const __m128i x_absmul13 = _mm_mul_epu32(x_abs1032, vmultiplier);
  69: const __m128i y_absmul13 = _mm_mul_epu32(y_abs1032, vmultiplier);
  70: const __m128i z_absmul13 = _mm_mul_epu32(z_abs1032, vmultiplier);
  71: const __m128i w_absmul13 = _mm_mul_epu32(w_abs1032, vmultiplier);

q31-wasmsimd.c (in xnn_qs8_requantize_q31__wasmsimd()):
  48: const v128_t vmultiplier = wasm_i64x2_make(twice_multiplier, twice_multiplier);   // local definition
  79: const v128_t x_product_lo = wasm_i64x2_add(wasm_i64x2_mul(x_lo, vmultiplier), vtwice_q31rounding);
  80: const v128_t y_product_lo = wasm_i64x2_add(wasm_i64x2_mul(y_lo, vmultiplier), vtwice_q31rounding);
  81: const v128_t z_product_lo = wasm_i64x2_add(wasm_i64x2_mul(z_lo, vmultiplier), vtwice_q31rounding);
  82: const v128_t w_product_lo = wasm_i64x2_add(wasm_i64x2_mul(w_lo, vmultiplier), vtwice_q31rounding);
  84: const v128_t x_product_hi = wasm_i64x2_add(wasm_i64x2_mul(x_hi, vmultiplier), vtwice_q31rounding);
  85: const v128_t y_product_hi = wasm_i64x2_add(wasm_i64x2_mul(y_hi, vmultiplier), vtwice_q31rounding);
  86: const v128_t z_product_hi = wasm_i64x2_add(wasm_i64x2_mul(z_hi, vmultiplier), vtwice_q31rounding);
  87: const v128_t w_product_hi = wasm_i64x2_add(wasm_i64x2_mul(w_hi, vmultiplier), vtwice_q31rounding);

precise-sse2.c (in xnn_qs8_requantize_precise__sse2()):
  40: const __m128i vmultiplier = _mm_set1_epi32(multiplier);   // local definition
  68: const __m128i x_absmul02 = _mm_mul_epu32(x_abs0123, vmultiplier);
  69: const __m128i y_absmul02 = _mm_mul_epu32(y_abs0123, vmultiplier);
  70: const __m128i z_absmul02 = _mm_mul_epu32(z_abs0123, vmultiplier);
  71: const __m128i w_absmul02 = _mm_mul_epu32(w_abs0123, vmultiplier);
  73: const __m128i x_absmul13 = _mm_mul_epu32(x_abs1032, vmultiplier);
  74: const __m128i y_absmul13 = _mm_mul_epu32(y_abs1032, vmultiplier);
  75: const __m128i z_absmul13 = _mm_mul_epu32(z_abs1032, vmultiplier);
  76: const __m128i w_absmul13 = _mm_mul_epu32(w_abs1032, vmultiplier);

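The qs8 listings above mirror the qu8 kernels line for line; the two families differ in output element type (int8 for qs8, uint8 for qu8), which on NEON surfaces at the very end as a signed versus an unsigned saturating narrow. A hedged sketch of just that packing step; the real kernels also apply the output zero point and min/max clamps first:

    #include <arm_neon.h>

    /* Same requantized int16 lanes, two output formats. */
    static inline int8x8_t pack_qs8(int16x8_t vacc) {
      return vqmovn_s16(vacc);   /* saturate to [-128, 127] */
    }
    static inline uint8x8_t pack_qu8(int16x8_t vacc) {
      return vqmovun_s16(vacc);  /* saturate to [0, 255] */
    }
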
/external/XNNPACK/src/qs8-gavgpool/gen/

7x-minmax-neon-c32-acc2.c (in xnn_qs8_gavgpool_minmax_ukernel_7x__neon_c32_acc2()):
   58: const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);   // local definition
   60: const int32x2_t vmultiplier = vld1_dup_s32(&params->neon.multiplier);    // local definition
  143: const int64x2_t vprod01 = vmull_s32(vget_low_s32(vacc0123), vget_low_s32(vmultiplier));
  144: const int64x2_t vprod23 = vmull_high_s32(vacc0123, vmultiplier);
  145: const int64x2_t vprod45 = vmull_s32(vget_low_s32(vacc4567), vget_low_s32(vmultiplier));
  146: const int64x2_t vprod67 = vmull_high_s32(vacc4567, vmultiplier);
  147: const int64x2_t vprod89 = vmull_s32(vget_low_s32(vacc89AB), vget_low_s32(vmultiplier));
  148: const int64x2_t vprodAB = vmull_high_s32(vacc89AB, vmultiplier);
  149: const int64x2_t vprodCD = vmull_s32(vget_low_s32(vaccCDEF), vget_low_s32(vmultiplier));
  150: const int64x2_t vprodEF = vmull_high_s32(vaccCDEF, vmultiplier);
  [all …]

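The "7x" global-average-pooling kernels handle up to seven input rows in a single pass: per channel they sum seven int8 values into a 32-bit accumulator seeded with a bias, then push the sum through the multiplier-based requantization shown above. A scalar outline of that shape, with all names and the bias convention as assumptions rather than XNNPACK's actual API:

    #include <stddef.h>
    #include <stdint.h>

    /* Placeholder for the multiplier/shift/zero-point/clamp sequence the
     * SIMD code implements. */
    extern int8_t requantize(int32_t acc);

    /* Hypothetical scalar model of a 7-row gavgpool pass. */
    void gavgpool_7x(const int8_t* rows[7], size_t channels, int32_t bias,
                     int8_t* output) {
      for (size_t c = 0; c < channels; c++) {
        int32_t acc = bias;  /* bias may fold in -7 * input_zero_point */
        for (size_t r = 0; r < 7; r++) {
          acc += (int32_t) rows[r][c];
        }
        output[c] = requantize(acc);
      }
    }
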
7x-minmax-neon-c24-acc2.c (in xnn_qs8_gavgpool_minmax_ukernel_7x__neon_c24_acc2()):
   58: const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);   // local definition
   60: const int32x2_t vmultiplier = vld1_dup_s32(&params->neon.multiplier);    // local definition
  126: const int64x2_t vprod01 = vmull_s32(vget_low_s32(vacc0123), vget_low_s32(vmultiplier));
  127: const int64x2_t vprod23 = vmull_high_s32(vacc0123, vmultiplier);
  128: const int64x2_t vprod45 = vmull_s32(vget_low_s32(vacc4567), vget_low_s32(vmultiplier));
  129: const int64x2_t vprod67 = vmull_high_s32(vacc4567, vmultiplier);
  130: const int64x2_t vprod89 = vmull_s32(vget_low_s32(vacc89AB), vget_low_s32(vmultiplier));
  131: const int64x2_t vprodAB = vmull_high_s32(vacc89AB, vmultiplier);
  132: const int64x2_t vprodCD = vmull_s32(vget_low_s32(vaccCDEF), vget_low_s32(vmultiplier));
  133: const int64x2_t vprodEF = vmull_high_s32(vaccCDEF, vmultiplier);
  [all …]

7x-minmax-neon-c16-acc2.c (in xnn_qs8_gavgpool_minmax_ukernel_7x__neon_c16_acc2()):
   58: const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);   // local definition
   60: const int32x2_t vmultiplier = vld1_dup_s32(&params->neon.multiplier);    // local definition
  109: const int64x2_t vprod01 = vmull_s32(vget_low_s32(vacc0123), vget_low_s32(vmultiplier));
  110: const int64x2_t vprod23 = vmull_high_s32(vacc0123, vmultiplier);
  111: const int64x2_t vprod45 = vmull_s32(vget_low_s32(vacc4567), vget_low_s32(vmultiplier));
  112: const int64x2_t vprod67 = vmull_high_s32(vacc4567, vmultiplier);
  113: const int64x2_t vprod89 = vmull_s32(vget_low_s32(vacc89AB), vget_low_s32(vmultiplier));
  114: const int64x2_t vprodAB = vmull_high_s32(vacc89AB, vmultiplier);
  115: const int64x2_t vprodCD = vmull_s32(vget_low_s32(vaccCDEF), vget_low_s32(vmultiplier));
  116: const int64x2_t vprodEF = vmull_high_s32(vaccCDEF, vmultiplier);
  [all …]

7x-minmax-neon-c8-acc2.c (in xnn_qs8_gavgpool_minmax_ukernel_7x__neon_c8_acc2()):
   58: const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);   // local definition
   60: const int32x2_t vmultiplier = vld1_dup_s32(&params->neon.multiplier);    // local definition
   92: const int64x2_t vprod01 = vmull_s32(vget_low_s32(vacc0123), vget_low_s32(vmultiplier));
   93: const int64x2_t vprod23 = vmull_high_s32(vacc0123, vmultiplier);
   94: const int64x2_t vprod45 = vmull_s32(vget_low_s32(vacc4567), vget_low_s32(vmultiplier));
   95: const int64x2_t vprod67 = vmull_high_s32(vacc4567, vmultiplier);
  102: const int64x2_t vprod01 = vmull_s32(vget_low_s32(vacc0123), vmultiplier);
  103: const int64x2_t vprod23 = vmull_s32(vget_high_s32(vacc0123), vmultiplier);
  104: const int64x2_t vprod45 = vmull_s32(vget_low_s32(vacc4567), vmultiplier);
  105: const int64x2_t vprod67 = vmull_s32(vget_high_s32(vacc4567), vmultiplier);
  [all …]

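The paired declarations at lines 58 and 60, and the two multiply sequences at lines 92-95 versus 102-105, are the two sides of an architecture split: vmull_high_s32 exists only on AArch64, so 32-bit ARM keeps vmultiplier as an int32x2_t and reaches the upper lanes with vget_high_s32. A sketch of that guard, using a generic __aarch64__ check rather than whatever macro the generated files actually test:

    #include <arm_neon.h>

    /* Multiply four int32 accumulators into two int64x2_t halves, choosing
     * the instruction sequence the target supports. */
    #if defined(__aarch64__)
    static inline void mul_acc(int32x4_t vacc, int32x4_t vmultiplier,
                               int64x2_t* lo, int64x2_t* hi) {
      *lo = vmull_s32(vget_low_s32(vacc), vget_low_s32(vmultiplier));
      *hi = vmull_high_s32(vacc, vmultiplier);
    }
    #else
    static inline void mul_acc(int32x4_t vacc, int32x2_t vmultiplier,
                               int64x2_t* lo, int64x2_t* hi) {
      *lo = vmull_s32(vget_low_s32(vacc), vmultiplier);
      *hi = vmull_s32(vget_high_s32(vacc), vmultiplier);
    }
    #endif
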
7p7x-minmax-neon-c16-acc2.c (in xnn_qs8_gavgpool_minmax_ukernel_7p7x__neon_c16_acc2()):
  173: const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);   // local definition
  175: const int32x2_t vmultiplier = vld1_dup_s32(&params->neon.multiplier);    // local definition
  229: const int64x2_t vprod01 = vmull_s32(vget_low_s32(vacc0123), vget_low_s32(vmultiplier));
  230: const int64x2_t vprod23 = vmull_high_s32(vacc0123, vmultiplier);
  231: const int64x2_t vprod45 = vmull_s32(vget_low_s32(vacc4567), vget_low_s32(vmultiplier));
  232: const int64x2_t vprod67 = vmull_high_s32(vacc4567, vmultiplier);
  233: const int64x2_t vprod89 = vmull_s32(vget_low_s32(vacc89AB), vget_low_s32(vmultiplier));
  234: const int64x2_t vprodAB = vmull_high_s32(vacc89AB, vmultiplier);
  235: const int64x2_t vprodCD = vmull_s32(vget_low_s32(vaccCDEF), vget_low_s32(vmultiplier));
  236: const int64x2_t vprodEF = vmull_high_s32(vaccCDEF, vmultiplier);
  [all …]

7p7x-minmax-neon-c32-acc2.c (in xnn_qs8_gavgpool_minmax_ukernel_7p7x__neon_c32_acc2()):
  306: const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);   // local definition
  308: const int32x2_t vmultiplier = vld1_dup_s32(&params->neon.multiplier);    // local definition
  400: const int64x2_t vprod01 = vmull_s32(vget_low_s32(vacc0123), vget_low_s32(vmultiplier));
  401: const int64x2_t vprod23 = vmull_high_s32(vacc0123, vmultiplier);
  402: const int64x2_t vprod45 = vmull_s32(vget_low_s32(vacc4567), vget_low_s32(vmultiplier));
  403: const int64x2_t vprod67 = vmull_high_s32(vacc4567, vmultiplier);
  404: const int64x2_t vprod89 = vmull_s32(vget_low_s32(vacc89AB), vget_low_s32(vmultiplier));
  405: const int64x2_t vprodAB = vmull_high_s32(vacc89AB, vmultiplier);
  406: const int64x2_t vprodCD = vmull_s32(vget_low_s32(vaccCDEF), vget_low_s32(vmultiplier));
  407: const int64x2_t vprodEF = vmull_high_s32(vaccCDEF, vmultiplier);
  [all …]

7p7x-minmax-neon-c8-acc2.c (in xnn_qs8_gavgpool_minmax_ukernel_7p7x__neon_c8_acc2()):
  137: const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);   // local definition
  139: const int32x2_t vmultiplier = vld1_dup_s32(&params->neon.multiplier);    // local definition
  174: const int64x2_t vprod01 = vmull_s32(vget_low_s32(vacc0123), vget_low_s32(vmultiplier));
  175: const int64x2_t vprod23 = vmull_high_s32(vacc0123, vmultiplier);
  176: const int64x2_t vprod45 = vmull_s32(vget_low_s32(vacc4567), vget_low_s32(vmultiplier));
  177: const int64x2_t vprod67 = vmull_high_s32(vacc4567, vmultiplier);
  184: const int64x2_t vprod01 = vmull_s32(vget_low_s32(vacc0123), vmultiplier);
  185: const int64x2_t vprod23 = vmull_s32(vget_high_s32(vacc0123), vmultiplier);
  186: const int64x2_t vprod45 = vmull_s32(vget_low_s32(vacc4567), vmultiplier);
  187: const int64x2_t vprod67 = vmull_s32(vget_high_s32(vacc4567), vmultiplier);
  [all …]

7p7x-minmax-neon-c24-acc2.c (in xnn_qs8_gavgpool_minmax_ukernel_7p7x__neon_c24_acc2()):
  270: const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);   // local definition
  272: const int32x2_t vmultiplier = vld1_dup_s32(&params->neon.multiplier);    // local definition
  345: const int64x2_t vprod01 = vmull_s32(vget_low_s32(vacc0123), vget_low_s32(vmultiplier));
  346: const int64x2_t vprod23 = vmull_high_s32(vacc0123, vmultiplier);
  347: const int64x2_t vprod45 = vmull_s32(vget_low_s32(vacc4567), vget_low_s32(vmultiplier));
  348: const int64x2_t vprod67 = vmull_high_s32(vacc4567, vmultiplier);
  349: const int64x2_t vprod89 = vmull_s32(vget_low_s32(vacc89AB), vget_low_s32(vmultiplier));
  350: const int64x2_t vprodAB = vmull_high_s32(vacc89AB, vmultiplier);
  351: const int64x2_t vprodCD = vmull_s32(vget_low_s32(vaccCDEF), vget_low_s32(vmultiplier));
  352: const int64x2_t vprodEF = vmull_high_s32(vaccCDEF, vmultiplier);
  [all …]

7x-minmax-sse41-c24-acc2.c (in xnn_qs8_gavgpool_minmax_ukernel_7x__sse41_c24_acc2()):
   57: const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);   // local definition
  134: const __m128i vabsprod02 = _mm_mul_epu32(vabsacc0123, vmultiplier);
  135: const __m128i vabsprod13 = _mm_mul_epu32(vabsacc13, vmultiplier);
  136: const __m128i vabsprod46 = _mm_mul_epu32(vabsacc4567, vmultiplier);
  137: const __m128i vabsprod57 = _mm_mul_epu32(vabsacc57, vmultiplier);
  138: const __m128i vabsprod8A = _mm_mul_epu32(vabsacc89AB, vmultiplier);
  139: const __m128i vabsprod9B = _mm_mul_epu32(vabsacc9B, vmultiplier);
  140: const __m128i vabsprodCE = _mm_mul_epu32(vabsaccCDEF, vmultiplier);
  141: const __m128i vabsprodDF = _mm_mul_epu32(vabsaccDF, vmultiplier);
  142: const __m128i vabsprodGI = _mm_mul_epu32(vabsaccGHIJ, vmultiplier);
  [all …]

7x-minmax-wasmsimd-c24-acc2.c (in xnn_qs8_gavgpool_minmax_ukernel_7x__wasmsimd_c24_acc2()):
   57: const v128_t vmultiplier = wasm_v128_load(params->wasmsimd.multiplier);   // local definition
  147: const v128_t vabsprod01 = wasm_i64x2_mul(vabsacc01, vmultiplier);
  148: const v128_t vabsprod23 = wasm_i64x2_mul(vabsacc23, vmultiplier);
  149: const v128_t vabsprod45 = wasm_i64x2_mul(vabsacc45, vmultiplier);
  150: const v128_t vabsprod67 = wasm_i64x2_mul(vabsacc67, vmultiplier);
  151: const v128_t vabsprod89 = wasm_i64x2_mul(vabsacc89, vmultiplier);
  152: const v128_t vabsprodAB = wasm_i64x2_mul(vabsaccAB, vmultiplier);
  153: const v128_t vabsprodCD = wasm_i64x2_mul(vabsaccCD, vmultiplier);
  154: const v128_t vabsprodEF = wasm_i64x2_mul(vabsaccEF, vmultiplier);
  155: const v128_t vabsprodGH = wasm_i64x2_mul(vabsaccGH, vmultiplier);
  [all …]

/external/XNNPACK/src/qu8-gavgpool/

7x-minmax-neon-c8.c (in xnn_qu8_gavgpool_minmax_ukernel_7x__neon_c8()):
   58: const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);   // local definition
   60: const int32x2_t vmultiplier = vld1_dup_s32(&params->neon.multiplier);    // local definition
   90: const int64x2_t vproduct01 = vmull_s32(vget_low_s32(vacc_lo), vget_low_s32(vmultiplier));
   91: const int64x2_t vproduct23 = vmull_high_s32(vacc_lo, vmultiplier);
   92: const int64x2_t vproduct45 = vmull_s32(vget_low_s32(vacc_hi), vget_low_s32(vmultiplier));
   93: const int64x2_t vproduct67 = vmull_high_s32(vacc_hi, vmultiplier);
  100: const int64x2_t vproduct01 = vmull_s32(vget_low_s32(vacc_lo), vmultiplier);
  101: const int64x2_t vproduct23 = vmull_s32(vget_high_s32(vacc_lo), vmultiplier);
  102: const int64x2_t vproduct45 = vmull_s32(vget_low_s32(vacc_hi), vmultiplier);
  103: const int64x2_t vproduct67 = vmull_s32(vget_high_s32(vacc_hi), vmultiplier);
  [all …]

7p7x-minmax-neon-c8.c (in xnn_qu8_gavgpool_minmax_ukernel_7p7x__neon_c8()):
  104: const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);   // local definition
  106: const int32x2_t vmultiplier = vld1_dup_s32(&params->neon.multiplier);    // local definition
  166: const int64x2_t vproduct01 = vmull_s32(vget_low_s32(vacc_lo), vget_low_s32(vmultiplier));
  167: const int64x2_t vproduct23 = vmull_high_s32(vacc_lo, vmultiplier);
  168: const int64x2_t vproduct45 = vmull_s32(vget_low_s32(vacc_hi), vget_low_s32(vmultiplier));
  169: const int64x2_t vproduct67 = vmull_high_s32(vacc_hi, vmultiplier);
  176: const int64x2_t vproduct01 = vmull_s32(vget_low_s32(vacc_lo), vmultiplier);
  177: const int64x2_t vproduct23 = vmull_s32(vget_high_s32(vacc_lo), vmultiplier);
  178: const int64x2_t vproduct45 = vmull_s32(vget_low_s32(vacc_hi), vmultiplier);
  179: const int64x2_t vproduct67 = vmull_s32(vget_high_s32(vacc_hi), vmultiplier);
  [all …]

/external/XNNPACK/src/qu8-avgpool/

9x-minmax-neon-c8.c (in xnn_qu8_avgpool_minmax_ukernel_9x__neon_c8()):
   36: const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);   // local definition
   38: const int32x2_t vmultiplier = vld1_dup_s32(&params->neon.multiplier);    // local definition
  145: const int64x2_t vproduct01 = vmull_s32(vget_low_s32(vacc_lo), vget_low_s32(vmultiplier));
  146: const int64x2_t vproduct23 = vmull_high_s32(vacc_lo, vmultiplier);
  147: const int64x2_t vproduct45 = vmull_s32(vget_low_s32(vacc_hi), vget_low_s32(vmultiplier));
  148: const int64x2_t vproduct67 = vmull_high_s32(vacc_hi, vmultiplier);
  155: const int64x2_t vproduct01 = vmull_s32(vget_low_s32(vacc_lo), vmultiplier);
  156: const int64x2_t vproduct23 = vmull_s32(vget_high_s32(vacc_lo), vmultiplier);
  157: const int64x2_t vproduct45 = vmull_s32(vget_low_s32(vacc_hi), vmultiplier);
  158: const int64x2_t vproduct67 = vmull_s32(vget_high_s32(vacc_hi), vmultiplier);
  [all …]

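Taken together, every listing above is an instance of the same precise-requantization recipe: take the accumulator's magnitude, multiply it into 64 bits, shift with round-to-nearest (ties away from zero), restore the sign, re-center on the output zero point, and clamp. A minimal scalar sketch of that recipe for the qu8 case; the multiplier/shift decomposition and the clamp bounds are illustrative assumptions, not XNNPACK's exact parameter layout:

    #include <stdint.h>

    /* Scalar model of precise requantization. Assumes scale < 1.0 was
     * decomposed offline into a positive multiplier (e.g. a 24-bit
     * mantissa) and a right shift of at least 24 bits. */
    static inline uint8_t requantize_precise(
        int32_t value, uint32_t multiplier, uint32_t shift,
        uint8_t zero_point, uint8_t qmin, uint8_t qmax) {
      /* Multiply the magnitude in 64 bits so no intermediate bit is lost. */
      const uint32_t abs_value = (value >= 0) ? (uint32_t) value : -(uint32_t) value;
      const uint64_t product = (uint64_t) abs_value * (uint64_t) multiplier;

      /* Round to nearest, ties away from zero, then restore the sign. */
      const uint64_t rounding = UINT64_C(1) << (shift - 1);
      const int32_t abs_scaled = (int32_t) ((product + rounding) >> shift);
      const int32_t scaled = (value >= 0) ? abs_scaled : -abs_scaled;

      /* Re-center on the output zero point and clamp to the quantized range. */
      int32_t out = scaled + (int32_t) zero_point;
      if (out < (int32_t) qmin) out = qmin;
      if (out > (int32_t) qmax) out = qmax;
      return (uint8_t) out;
    }
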