/external/XNNPACK/src/q8-gemm/

D | 4x4c2-sse2.c | in xnn_q8_gemm_ukernel_4x4c2__sse2():
     80  const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb0, vzero), vb_zero_point);  (local)
     83  _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
     85  _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
     87  _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
     89  _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
    146  const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb0, vzero), vb_zero_point);  (local)
    149  _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
    151  _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
    153  _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
    155  _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
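
The hits above all share one SSE2 idiom: the packed uint8 filter values are zero-extended to int16 by interleaving with a zero register, the filter zero point is subtracted, and _mm_madd_epi16 multiplies vxb0 against a broadcast pair of activation values, accumulating into four int32 lanes. Below is a minimal standalone sketch of that single step for one activation row; the data, the zero point of 128, and the main() harness are made up for illustration, and only the vxb0/vacc0x0123 pattern mirrors the kernel.

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* Made-up inputs: 8 packed uint8 filter values and one pair of
   * already zero-point-adjusted int16 activations. */
  const uint8_t b[8] = {130, 120, 135, 125, 140, 110, 128, 128};
  const int16_t a[8] = {3, -2, 0, 0, 0, 0, 0, 0};
  const __m128i vzero = _mm_setzero_si128();
  const __m128i vb_zero_point = _mm_set1_epi16(128);  /* assumed zero point */

  const __m128i vb0 = _mm_loadl_epi64((const __m128i*) b);
  const __m128i vxa0 = _mm_loadu_si128((const __m128i*) a);

  /* Zero-extend uint8 -> int16, then subtract the filter zero point. */
  const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb0, vzero), vb_zero_point);

  /* Broadcast A's first 32-bit lane (one pair of int16 values) and
   * multiply-add adjacent pairs into four int32 accumulators. */
  __m128i vacc0x0123 = _mm_setzero_si128();
  vacc0x0123 = _mm_add_epi32(vacc0x0123,
      _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));

  int32_t out[4];
  _mm_storeu_si128((__m128i*) out, vacc0x0123);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  /* 22 27 72 0 */
  return 0;
}
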
D | 2x2-scalar.c | in xnn_q8_gemm_ukernel_2x2__scalar():
     56  const int32_t vxb0 = (int32_t) vb0 - vb_zero_point;  (local)
     59  vacc0x0 += va0 * vxb0;
     61  vacc1x0 += va1 * vxb0;
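
The scalar kernel does the same dequantize-then-MAC with plain integers: widen the uint8 filter byte, subtract the zero point once, and reuse the adjusted value across both output rows. A minimal sketch with made-up data and an assumed zero point of 128:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* Made-up inputs: one uint8 filter byte, its assumed zero point, and two
   * already zero-point-adjusted activations (one per output row). */
  const uint8_t vb0 = 140;
  const int32_t vb_zero_point = 128;
  const int32_t va0 = 5, va1 = -3;

  int32_t vacc0x0 = 0, vacc1x0 = 0;

  /* Widen to int32 and subtract the zero point once, then reuse the
   * adjusted value across both rows' accumulators. */
  const int32_t vxb0 = (int32_t) vb0 - vb_zero_point;
  vacc0x0 += va0 * vxb0;
  vacc1x0 += va1 * vxb0;

  printf("%d %d\n", vacc0x0, vacc1x0);  /* 60 -36 */
  return 0;
}
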
D | 2x4c8-sse2.c | in xnn_q8_gemm_ukernel_2x4c8__sse2():
     85  const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb0, vzero), vb_zero_point);  (local)
     94  vacc00 = _mm_add_epi32(vacc00, _mm_madd_epi16(vxa0, vxb0));
     98  vacc10 = _mm_add_epi32(vacc10, _mm_madd_epi16(vxa1, vxb0));
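
The c8 variant differs from c2 only in how _mm_madd_epi16 is fed: A and B each supply eight consecutive int16 values per step, so no broadcast shuffle is needed, and the four int32 lanes hold partial sums that the kernel reduces across lanes later. A standalone sketch with made-up data and an assumed zero point of 128:

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* Made-up inputs: 8 uint8 filter values and 8 zero-point-adjusted
   * int16 activations, all consumed in one step. */
  const uint8_t b[8] = {129, 127, 130, 126, 131, 125, 128, 128};
  const int16_t a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  const __m128i vzero = _mm_setzero_si128();
  const __m128i vb_zero_point = _mm_set1_epi16(128);  /* assumed zero point */

  const __m128i vb0 = _mm_loadl_epi64((const __m128i*) b);
  const __m128i vxa0 = _mm_loadu_si128((const __m128i*) a);
  const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb0, vzero), vb_zero_point);

  /* No shuffle: madd the full 8-element vectors. Each int32 lane is a
   * partial sum of two adjacent products, reduced across lanes later. */
  __m128i vacc00 = _mm_setzero_si128();
  vacc00 = _mm_add_epi32(vacc00, _mm_madd_epi16(vxa0, vxb0));

  int32_t out[4];
  _mm_storeu_si128((__m128i*) out, vacc00);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  /* -1 -2 -3 0 */
  return 0;
}
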
/external/XNNPACK/src/q8-igemm/

D | 4x4c2-sse2.c | in xnn_q8_igemm_ukernel_4x4c2__sse2():
     97  const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb0, vzero), vb_zero_point);  (local)
     98  … _mm_add_epi32(vacc0x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
     99  … _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
    100  … _mm_add_epi32(vacc2x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
    101  … _mm_add_epi32(vacc3x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
    139  const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb0, vzero), vb_zero_point);  (local)
    142  … _mm_add_epi32(vacc0x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
    143  … _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
    144  … _mm_add_epi32(vacc2x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
    145  … _mm_add_epi32(vacc3x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));

D | 2x2-scalar.c | in xnn_q8_igemm_ukernel_2x2__scalar():
     69  const int32_t vxb0 = (int32_t) vb0 - vb_zero_point;  (local)
     72  vacc0x0 += va0 * vxb0;
     74  vacc1x0 += va1 * vxb0;
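
Both q8-igemm entries repeat the q8-gemm idioms above line for line: the same unpack/subtract/madd sequence in 4x4c2-sse2.c and the same widen/subtract/MAC sequence in 2x2-scalar.c. The indirect-GEMM kernels differ mainly in how the activation rows are addressed, so the sketches above apply unchanged.
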
/external/XNNPACK/src/q8-vadd/

D | neon.c | in xnn_q8_vadd_ukernel__neon():
     40  const int16x8_t vxb0 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(vb01), vb_zero_point));  (local)
     58  vacc0_lo = vmlaq_s32(vacc0_lo, vmovl_s16(vget_low_s16(vxb0)), vb_multiplier);
     62  vacc0_hi = vmlaq_s32(vacc0_hi, vmovl_high_s16(vxb0), vb_multiplier);
    110  const int16x8_t vxb0 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(vb01), vb_zero_point));  (local)
    123  vacc0_lo = vmlaq_s32(vacc0_lo, vmovl_s16(vget_low_s16(vxb0)), vb_multiplier);
    125  vacc0_hi = vmlaq_s32(vacc0_hi, vmovl_s16(vget_high_s16(vxb0)), vb_multiplier);
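
On NEON the widening and the zero-point subtraction fuse into one step: vsubl_u8 subtracts uint8 lanes into uint16, and reinterpreting as int16 recovers the signed difference; each half is then widened to int32 and scaled into the accumulator with vmlaq_s32. (The vmovl_high_s16 form at line 62 is the AArch64 spelling of the vget_high_s16 form at line 125.) A standalone sketch with a made-up zero point and multiplier; build for ARM or AArch64:

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* Made-up inputs: 8 uint8 values, an assumed zero point of 128, and an
   * assumed per-operand multiplier of 3. */
  const uint8_t b[8] = {130, 126, 128, 131, 125, 128, 129, 127};
  const uint8x8_t vb = vld1_u8(b);
  const uint8x8_t vb_zero_point = vdup_n_u8(128);
  const int32x4_t vb_multiplier = vdupq_n_s32(3);

  /* Widening subtract: uint8 - uint8 -> uint16; reinterpreting as int16
   * recovers the signed difference even when b < zero_point. */
  const int16x8_t vxb0 = vreinterpretq_s16_u16(vsubl_u8(vb, vb_zero_point));

  /* Widen each int16 half to int32 and scale into the accumulators. */
  int32x4_t vacc0_lo = vdupq_n_s32(0);
  int32x4_t vacc0_hi = vdupq_n_s32(0);
  vacc0_lo = vmlaq_s32(vacc0_lo, vmovl_s16(vget_low_s16(vxb0)), vb_multiplier);
  vacc0_hi = vmlaq_s32(vacc0_hi, vmovl_s16(vget_high_s16(vxb0)), vb_multiplier);

  int32_t out[8];
  vst1q_s32(out, vacc0_lo);
  vst1q_s32(out + 4, vacc0_hi);
  for (int i = 0; i < 8; i++) printf("%d ", out[i]);  /* 3*(b[i]-128) */
  printf("\n");
  return 0;
}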