/external/XNNPACK/src/qu8-vadd/
D | minmax-sse2.c | all matches in xnn_qu8_vadd_minmax_ukernel__sse2():
     53  …__m128i vacc_hi = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(va_product_lo, va_product_…  (local)
     56  vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vb_product_lo, vb_product_hi));
     62  …_mm_add_epi32(_mm_and_si128(vacc_hi, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_h…
     65  …vacc_hi = _mm_sub_epi32(_mm_sra_epi32(vacc_hi, vshift), _mm_cmpgt_epi32(vrem_hi, vremainder_thresh…
     69  const __m128i vacc = _mm_adds_epi16(_mm_packs_epi32(vacc_lo, vacc_hi), vy_zero_point);
     95  …__m128i vacc_hi = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(va_product_lo, va_product_…  (local)
     98  vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vb_product_lo, vb_product_hi));
    104  …_mm_add_epi32(_mm_and_si128(vacc_hi, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_h…
    107  …vacc_hi = _mm_sub_epi32(_mm_sra_epi32(vacc_hi, vshift), _mm_cmpgt_epi32(vrem_hi, vremainder_thresh…
    111  const __m128i vacc = _mm_adds_epi16(_mm_packs_epi32(vacc_lo, vacc_hi), vy_zero_point);

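The `_mm_and`/`_mm_cmpgt`/`_mm_sra`/`_mm_sub` sequence on lines 62 and 65 is a rounding arithmetic right shift with ties rounded away from zero. A scalar sketch of the same step follows; `remainder_mask` and `remainder_threshold` mirror the kernel's parameter names, but the helper itself is illustrative, not an XNNPACK function, and it assumes `>>` on a negative `int32_t` is an arithmetic shift (true on the targets these kernels build for):

    #include <stdint.h>

    /* Scalar model of lines 62/65: remainder_mask == (1 << shift) - 1 and
     * remainder_threshold == remainder_mask >> 1, as precomputed in the
     * kernel's params struct. */
    static inline int32_t rounding_shift_model(int32_t acc, uint32_t shift,
                                               int32_t remainder_mask,
                                               int32_t remainder_threshold) {
      /* Line 62: _mm_cmpgt_epi32(0, acc) contributes -1 (all ones) per
       * negative lane, biasing the remainder down for negative inputs. */
      const int32_t remainder = (acc & remainder_mask) - (int32_t) (acc < 0);
      /* Line 65: subtracting the all-ones comparison mask adds 1 whenever
       * the remainder exceeds the threshold. */
      return (acc >> shift) + (int32_t) (remainder > remainder_threshold);
    }

Line 69 then packs the shifted accumulators to 16 bits with signed saturation and adds the output zero point in a single saturating step.
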
D | minmax-neon.c | all matches in xnn_qu8_vadd_minmax_ukernel__neon():
    161  int32x4_t vacc_hi = vmulq_s32(vmovl_high_s16(vxa), va_multiplier);  (local)
    163  int32x4_t vacc_hi = vmulq_s32(vmovl_s16(vget_high_s16(vxa)), va_multiplier);  (local)
    168  vacc_hi = vmlaq_s32(vacc_hi, vmovl_high_s16(vxb), vb_multiplier);
    170  vacc_hi = vmlaq_s32(vacc_hi, vmovl_s16(vget_high_s16(vxb)), vb_multiplier);
    175  vacc_hi = vsraq_n_s32(vacc_hi, vbicq_s32(vacc_hi, vzero_shift_mask), 31);
    178  vacc_hi = vrshlq_s32(vacc_hi, vright_shift);
    182  const int16x8_t vacc = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc_lo), vacc_hi), vy_zero_point);
    184  …const int16x8_t vacc = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)), vy_zero_…
    204  int32x4_t vacc_hi = vmulq_s32(vmovl_high_s16(vxa), va_multiplier);  (local)
    206  int32x4_t vacc_hi = vmulq_s32(vmovl_s16(vget_high_s16(vxa)), va_multiplier);  (local)
    [all …]

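The paired lines here (161/163, 168/170, 182/184) are AArch64 and AArch32 variants of the same step: `vmovl_high_s16` and `vqmovn_high_s32` exist only on AArch64, so the other path goes through `vget_high_s16`/`vcombine_s16`. Lines 175 and 178 implement the rounding shift: `vsraq_n_s32` adds -1 to negative lanes (suppressed via `vzero_shift_mask` when the shift is zero) so that `vrshlq_s32`, which rounds half up, ends up rounding half away from zero. A scalar model of the combined effect, with an illustrative helper name:

    #include <stdint.h>

    /* Scalar model of lines 175+178. Assumes 1 <= shift <= 31 and that
     * acc >> 31 is an arithmetic shift. */
    static inline int32_t neon_rounding_shift_model(int32_t acc, uint32_t shift) {
      const int64_t fixup = acc >> 31;                     /* -1 if acc < 0, else 0 */
      const int64_t rounding = INT64_C(1) << (shift - 1);  /* VRSHL rounding term   */
      return (int32_t) (((int64_t) acc + fixup + rounding) >> shift);
    }
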
/external/XNNPACK/src/qu8-dwconv/
D | up8x9-minmax-sse2.c | all matches in xnn_qu8_dwconv_minmax_ukernel_up8x9__sse2():
     73  __m128i vacc_hi = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));  (local)
     82  vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vprod0_odd, vprod0_even));
     91  vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vprod1_odd, vprod1_even));
    100  vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vprod2_odd, vprod2_even));
    109  vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vprod3_odd, vprod3_even));
    118  vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vprod4_odd, vprod4_even));
    127  vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vprod5_odd, vprod5_even));
    136  vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vprod6_odd, vprod6_even));
    145  vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vprod7_odd, vprod7_even));
    154  vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vprod8_odd, vprod8_even));
    [all …]

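The `vprodN_odd`/`vprodN_even` pairs, one per filter tap, exist because SSE2 has no packed 32-bit multiply: each 16x16 product is computed as separate low and high halves and re-interleaved into 32-bit lanes before being added to `vacc_lo`/`vacc_hi`. The excerpt shows only the unpack half of the idiom; the products themselves would come from `_mm_mullo_epi16`/`_mm_mulhi_epi16`, the standard SSE2 widening-multiply pattern. A minimal sketch (the helper name is illustrative):

    #include <emmintrin.h>

    /* Widening signed 16x16 -> 32-bit multiply on SSE2: _mm_mullo_epi16 and
     * _mm_mulhi_epi16 yield the low and high 16 bits of each product, and
     * the unpacks interleave them back into eight full 32-bit products. */
    static inline void mul_s16_widen(__m128i a, __m128i b, __m128i* lo, __m128i* hi) {
      const __m128i prod_low  = _mm_mullo_epi16(a, b);   /* low 16 bits  */
      const __m128i prod_high = _mm_mulhi_epi16(a, b);   /* high 16 bits */
      *lo = _mm_unpacklo_epi16(prod_low, prod_high);     /* products of lanes 0-3 */
      *hi = _mm_unpackhi_epi16(prod_low, prod_high);     /* products of lanes 4-7 */
    }
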
D | up8x9-minmax-neon.c | all matches in xnn_qu8_dwconv_minmax_ukernel_up8x9__neon():
    144  int32x4_t vacc_hi = vaddq_s32(vaccX0_hi, vaccX1_hi);  (local)
    147  vacc_hi = vqrdmulhq_s32(vacc_hi, vmultiplier);
    151  vacc_hi = vsraq_n_s32(vacc_hi, vbicq_s32(vacc_hi, vzero_shift_mask), 31);
    154  vacc_hi = vrshlq_s32(vacc_hi, vright_shift);
    157  …const int16x8_t vacc = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc_lo), vacc_hi), voutput_zero_poin…
    159  …const int16x8_t vacc = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)), voutput_…
    235  int32x4_t vacc_hi = vaddq_s32(vaccX0_hi, vaccX1_hi);  (local)
    238  vacc_hi = vqrdmulhq_s32(vacc_hi, vmultiplier);
    242  vacc_hi = vsraq_n_s32(vacc_hi, vbicq_s32(vacc_hi, vzero_shift_mask), 31);
    245  vacc_hi = vrshlq_s32(vacc_hi, vright_shift);
    [all …]

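Lines 147/238 scale the accumulator with `vqrdmulhq_s32`, a saturating rounding doubling multiply that keeps the high 32 bits; combined with the rounding shift on lines 151-154 (the same fixup-plus-`vrshlq_s32` pattern modeled after the qu8-vadd entries above), this is the classic gemmlowp-style fixed-point requantization. A scalar model of the multiply, with an illustrative name:

    #include <stdint.h>

    /* Scalar model of vqrdmulhq_s32 per lane: (a*b + 2^30) >> 31, saturated.
     * The only input pair that actually saturates is a == b == INT32_MIN. */
    static inline int32_t sqrdmulh_s32_model(int32_t a, int32_t b) {
      const int64_t product = (int64_t) a * (int64_t) b;
      const int64_t rounded = (product + (INT64_C(1) << 30)) >> 31;
      return rounded > INT32_MAX ? INT32_MAX : (int32_t) rounded;
    }
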
/external/XNNPACK/src/qu8-gavgpool/
D | 7p7x-minmax-neon-c8.c | all matches in xnn_qu8_gavgpool_minmax_ukernel_7p7x__neon_c8():
     62  const int32x4_t vacc_hi = vaddw_s16(vbias, vget_high_s16(vsum));  (local)
     65  vst1q_s32(acc, vacc_hi); acc += 4;
     87  const int32x4_t vacc_hi = vld1q_s32(acc + 4);  (local)
     99  vst1q_s32(acc, vaddw_s16(vacc_hi, vget_high_s16(vsum))); acc += 4;
    149  int32x4_t vacc_hi = vld1q_s32(acc); acc += 4;  (local)
    160  vacc_hi = vaddw_s16(vacc_hi, vget_high_s16(vsum));
    163  const int32x4_t vneg_mask_hi = vreinterpretq_s32_u32(vcltq_s32(vacc_hi, vmovq_n_s32(0)));
    168  const int64x2_t vproduct45 = vmull_s32(vget_low_s32(vacc_hi), vget_low_s32(vmultiplier));
    169  const int64x2_t vproduct67 = vmull_high_s32(vacc_hi, vmultiplier);
    178  const int64x2_t vproduct45 = vmull_s32(vget_low_s32(vacc_hi), vmultiplier);
    [all …]

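The 7p7x kernel handles more than seven input rows in passes over a scratch accumulator: the first pass stores `vbias` plus a seven-row sum (lines 62-65), middle passes reload, re-accumulate, and store back (lines 87-99), and the final pass (lines 149 onward) requantizes. A scalar sketch of that accumulation structure, with illustrative names and the seven-row pass grouping elided:

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar sketch of the 7p7x multipass accumulation: seed a scratch
     * buffer with the bias, then add each row's pixels in place. The real
     * kernel consumes rows seven at a time and requantizes at the end. */
    static void gavgpool_accumulate(const uint8_t* rows[], size_t nrows,
                                    size_t channels, int32_t bias, int32_t* acc) {
      for (size_t c = 0; c < channels; c++) {
        acc[c] = bias;            /* first pass: vaddw_s16(vbias, vsum), vst1q_s32 */
      }
      for (size_t r = 0; r < nrows; r++) {
        for (size_t c = 0; c < channels; c++) {
          acc[c] += rows[r][c];   /* later passes: vld1q_s32 / vaddw_s16 / vst1q_s32 */
        }
      }
    }
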
D | 7x-minmax-neon-c8.c | all matches in xnn_qu8_gavgpool_minmax_ukernel_7x__neon_c8():
     84  int32x4_t vacc_hi = vaddw_s16(vbias, vget_high_s16(vsum));  (local)
     87  const int32x4_t vneg_mask_hi = vreinterpretq_s32_u32(vcltq_s32(vacc_hi, vmovq_n_s32(0)));
     92  const int64x2_t vproduct45 = vmull_s32(vget_low_s32(vacc_hi), vget_low_s32(vmultiplier));
     93  const int64x2_t vproduct67 = vmull_high_s32(vacc_hi, vmultiplier);
    102  const int64x2_t vproduct45 = vmull_s32(vget_low_s32(vacc_hi), vmultiplier);
    103  const int64x2_t vproduct67 = vmull_s32(vget_high_s32(vacc_hi), vmultiplier);
    118  … vacc_hi = vuzp1q_s32(vreinterpretq_s32_s64(vscaled_acc45), vreinterpretq_s32_s64(vscaled_acc67));
    120  …const int16x8_t vacc = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc_lo), vacc_hi), voutput_zero_poin…
    123  vacc_hi = vcombine_s32(vmovn_s64(vscaled_acc45), vmovn_s64(vscaled_acc67));
    125  …const int16x8_t vacc = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)), voutput_…
    [all …]

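Because a 32-bit sum scaled by a 32-bit fixed-point multiplier needs 64 bits of headroom, the NEON gavgpool widens with `vmull_s32` (`vmull_high_s32` on AArch64, two plain `vmull_s32` calls elsewhere, lines 92-103), applies the sign fixup recorded in the `vneg_mask` lanes plus a rounding shift, and narrows back by taking the even 32-bit lanes: `vuzp1q_s32` on AArch64 (line 118), `vmovn_s64` + `vcombine_s32` otherwise (line 123). A scalar model of the scaling, assuming a positive multiplier and `1 <= shift <= 63`:

    #include <stdint.h>

    /* Scalar model of the 64-bit scaling path: widen, subtract 1 from
     * negative accumulators (the vneg_mask adjustment) so the round-half-up
     * shift rounds half away from zero, then truncate back to 32 bits. */
    static inline int32_t scale_acc_model(int32_t acc, int32_t multiplier,
                                          uint32_t shift) {
      int64_t product = (int64_t) acc * (int64_t) multiplier;  /* vmull_s32 */
      if (acc < 0) product -= 1;                               /* vneg_mask fixup */
      const int64_t rounding = INT64_C(1) << (shift - 1);
      return (int32_t) ((product + rounding) >> shift);        /* shift + narrow */
    }
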
D | 7p7x-minmax-sse2-c8.c | all matches in xnn_qu8_gavgpool_minmax_ukernel_7p7x__sse2_c8():
     69  const __m128i vacc_hi = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vsum, vzero));  (local)
     72  _mm_store_si128((__m128i*) acc + 1, vacc_hi);
     94  __m128i vacc_hi = _mm_load_si128((const __m128i*) acc + 1);  (local)
    113  vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));
    116  _mm_store_si128((__m128i*) acc + 1, vacc_hi);
    161  __m128i vacc_hi = _mm_load_si128((const __m128i*) acc + 1);  (local)
    181  vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));
    184  const __m128i vneg_mask_hi = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_hi);
    187  const __m128i vabs_hi0123 = _mm_sub_epi32(_mm_xor_si128(vacc_hi, vneg_mask_hi), vneg_mask_hi);
    233  __m128i vacc_hi = _mm_load_si128((const __m128i*) acc + 1);  (local)
    [all …]

D | 7x-minmax-sse2-c8.c | all matches in xnn_qu8_gavgpool_minmax_ukernel_7x__sse2_c8():
     87  __m128i vacc_hi = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vsum, vzero));  (local)
     90  const __m128i vneg_mask_hi = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_hi);
     93  const __m128i vabs_hi0123 = _mm_sub_epi32(_mm_xor_si128(vacc_hi, vneg_mask_hi), vneg_mask_hi);
    156  __m128i vacc_hi = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vsum, vzero));  (local)
    159  const __m128i vneg_mask_hi = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_hi);
    162  const __m128i vabs_hi0123 = _mm_sub_epi32(_mm_xor_si128(vacc_hi, vneg_mask_hi), vneg_mask_hi);

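On SSE2 the same scaling must work around the lack of a signed 32x32->64 multiply: the only 64-bit-producing multiply, `_mm_mul_epu32`, is unsigned, so the kernel takes absolute values first and restores the sign afterwards. Lines 90/93 (and 159/162) show the branch-free absolute value `(x ^ m) - m` with `m = (0 > x)`. A scalar model with illustrative names:

    #include <stdint.h>

    /* Scalar model of the SSE2 scaling path: scale |acc| with an unsigned
     * multiply, round, then restore the sign with the same xor/subtract
     * trick. Assumes acc != INT32_MIN (true for these pooling sums). */
    static inline int32_t sse2_scale_model(int32_t acc, uint32_t multiplier,
                                           uint64_t rounding, uint32_t shift) {
      const int32_t neg_mask = -(int32_t) (acc < 0);        /* 0, or -1 (all ones) */
      const uint32_t abs_acc = (uint32_t) ((acc ^ neg_mask) - neg_mask);
      const uint64_t abs_scaled =
          ((uint64_t) abs_acc * multiplier + rounding) >> shift;
      const int32_t scaled = (int32_t) abs_scaled;
      return (scaled ^ neg_mask) - neg_mask;                /* restore sign */
    }
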
/external/XNNPACK/src/qu8-avgpool/
D | 9p8x-minmax-neon-c8.c | all matches in xnn_qu8_avgpool_minmax_ukernel_9p8x__neon_c8():
    115  const int32x4_t vacc_hi = vaddw_s16(vbias, vreinterpret_s16_u16(vget_high_u16(vsum)));  (local)
    118  vst1q_s32(b, vacc_hi); b += 4;
    176  int32x4_t vacc_hi = vld1q_s32(b + 4);  (local)
    188  vacc_hi = vaddw_s16(vacc_hi, vreinterpret_s16_u16(vget_high_u16(vsum)));
    191  vst1q_s32(b, vacc_hi); b += 4;
    271  int32x4_t vacc_hi = vld1q_s32(b); b += 4;  (local)
    283  vacc_hi = vaddw_s16(vacc_hi, vget_high_s16(vsum));
    286  const int32x4_t vneg_mask_hi = vreinterpretq_s32_u32(vcltq_s32(vacc_hi, vmovq_n_s32(0)));
    291  const int64x2_t vproduct45 = vmull_s32(vget_low_s32(vacc_hi), vget_low_s32(vmultiplier));
    292  const int64x2_t vproduct67 = vmull_high_s32(vacc_hi, vmultiplier);
    [all …]

D | 9x-minmax-neon-c8.c | all matches in xnn_qu8_avgpool_minmax_ukernel_9x__neon_c8():
    139  int32x4_t vacc_hi = vaddw_s16(vbias, vreinterpret_s16_u16(vget_high_u16(vsum)));  (local)
    142  const int32x4_t vneg_mask_hi = vreinterpretq_s32_u32(vcltq_s32(vacc_hi, vmovq_n_s32(0)));
    147  const int64x2_t vproduct45 = vmull_s32(vget_low_s32(vacc_hi), vget_low_s32(vmultiplier));
    148  const int64x2_t vproduct67 = vmull_high_s32(vacc_hi, vmultiplier);
    157  const int64x2_t vproduct45 = vmull_s32(vget_low_s32(vacc_hi), vmultiplier);
    158  const int64x2_t vproduct67 = vmull_s32(vget_high_s32(vacc_hi), vmultiplier);
    173  … vacc_hi = vuzp1q_s32(vreinterpretq_s32_s64(vscaled_acc45), vreinterpretq_s32_s64(vscaled_acc67));
    175  …const int16x8_t vacc = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc_lo), vacc_hi), voutput_zero_poin…
    178  vacc_hi = vcombine_s32(vmovn_s64(vscaled_acc45), vmovn_s64(vscaled_acc67));
    180  …const int16x8_t vacc = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)), voutput_…
    [all …]

D | 9p8x-minmax-sse2-c8.c | all matches in xnn_qu8_avgpool_minmax_ukernel_9p8x__sse2_c8():
    119  const __m128i vacc_hi = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vsum, vzero));  (local)
    122  _mm_store_si128((__m128i*) b + 1, vacc_hi);
    181  __m128i vacc_hi = _mm_load_si128((const __m128i*) b + 1);  (local)
    202  vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));
    205  _mm_store_si128((__m128i*) b + 1, vacc_hi);
    286  __m128i vacc_hi = _mm_load_si128((const __m128i*) b + 1);  (local)
    308  vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));
    311  const __m128i vneg_mask_hi = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_hi);
    314  const __m128i vabs_hi0123 = _mm_sub_epi32(_mm_xor_si128(vacc_hi, vneg_mask_hi), vneg_mask_hi);
    362  __m128i vacc_hi = _mm_load_si128((const __m128i*) b + 1);  (local)
    [all …]

D | 9x-minmax-sse2-c8.c | all matches in xnn_qu8_avgpool_minmax_ukernel_9x__sse2_c8():
    143  const __m128i vacc_hi = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vsum, vzero));  (local)
    146  const __m128i vneg_mask_hi = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_hi);
    149  const __m128i vabs_hi0123 = _mm_sub_epi32(_mm_xor_si128(vacc_hi, vneg_mask_hi), vneg_mask_hi);
    218  const __m128i vacc_hi = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vsum, vzero));  (local)
    221  const __m128i vneg_mask_hi = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_hi);
    224  const __m128i vabs_hi0123 = _mm_sub_epi32(_mm_xor_si128(vacc_hi, vneg_mask_hi), vneg_mask_hi);

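Read together, the avgpool excerpts trace one requantization pipeline per output channel: widen and sum the nine pooled inputs (the `vaddw_s16` / `_mm_unpackhi_epi16` lines), add the precomputed bias, scale by the fixed-point multiplier (either 64-bit path sketched above), add the output zero point with saturation (`vqaddq_s16` / `_mm_adds_epi16`), and clamp. An end-to-end scalar sketch; every name here is illustrative, and the bias is assumed to fold the input zero-point contribution the way the packed params usually do:

    #include <stdint.h>

    typedef int32_t (*scale_fn)(int32_t acc, int32_t multiplier, uint32_t shift);

    /* End-to-end scalar sketch of one channel of the 9x average pool; scale
     * stands in for a scaling routine like the models sketched earlier. */
    static inline uint8_t avgpool9_channel(const uint8_t x[9], int32_t bias,
                                           int32_t multiplier, uint32_t shift,
                                           scale_fn scale, int32_t output_zero_point,
                                           uint8_t output_min, uint8_t output_max) {
      int32_t acc = bias;                     /* bias loaded from params */
      for (int i = 0; i < 9; i++) {
        acc += (int32_t) x[i];                /* widening accumulate */
      }
      int32_t out = scale(acc, multiplier, shift) + output_zero_point;
      if (out < (int32_t) output_min) out = output_min;   /* clamp to [min, max] */
      if (out > (int32_t) output_max) out = output_max;
      return (uint8_t) out;
    }
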