/external/XNNPACK/src/q8-igemm/ |
D | 4x8-neon.c |
      91  const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1)); in xnn_q8_igemm_ukernel_4x8__neon() local
     101  vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 0); in xnn_q8_igemm_ukernel_4x8__neon()
     102  … vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 0); in xnn_q8_igemm_ukernel_4x8__neon()
     115  vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 1); in xnn_q8_igemm_ukernel_4x8__neon()
     116  … vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 1); in xnn_q8_igemm_ukernel_4x8__neon()
     129  vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 2); in xnn_q8_igemm_ukernel_4x8__neon()
     130  … vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 2); in xnn_q8_igemm_ukernel_4x8__neon()
     143  vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 3); in xnn_q8_igemm_ukernel_4x8__neon()
     144  … vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 3); in xnn_q8_igemm_ukernel_4x8__neon()
     157  … vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa1), 0); in xnn_q8_igemm_ukernel_4x8__neon()
     [all …]
|
D | 8x8-neon.c |
     135  const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1)); in xnn_q8_igemm_ukernel_8x8__neon() local
     149  vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 0); in xnn_q8_igemm_ukernel_8x8__neon()
     150  … vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 0); in xnn_q8_igemm_ukernel_8x8__neon()
     171  vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 1); in xnn_q8_igemm_ukernel_8x8__neon()
     172  … vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 1); in xnn_q8_igemm_ukernel_8x8__neon()
     193  vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 2); in xnn_q8_igemm_ukernel_8x8__neon()
     194  … vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 2); in xnn_q8_igemm_ukernel_8x8__neon()
     215  vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 3); in xnn_q8_igemm_ukernel_8x8__neon()
     216  … vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 3); in xnn_q8_igemm_ukernel_8x8__neon()
     237  … vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa1), 0); in xnn_q8_igemm_ukernel_8x8__neon()
     [all …]
|
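The two NEON micro-kernels above share one inner-loop shape: a row of uint8 activations is widened to int16 once (the vxa1 lines marked "local", via vmovl_u8 + vreinterpretq_s16_u16), and each k step widens an 8-wide slice of B and folds it into the int32 accumulators with vmlal_lane_s16, selecting one activation lane per step. Below is a simplified, single-row sketch of that pattern, not the XNNPACK kernel itself; function and parameter names are hypothetical, and zero-point handling and requantization are omitted.

  #include <arm_neon.h>
  #include <stdint.h>

  /* Simplified single-row illustration (hypothetical names): widen a row of
   * uint8 activations to int16 once, then accumulate four k steps against
   * 8 output channels of B, using one lane of the widened activations per step. */
  static void q8_row_mac_sketch(
      const uint8_t a[8],     /* 8 consecutive k values from one row of A */
      const uint8_t b[4][8],  /* for k = 0..3: 8 output channels of B     */
      int32_t acc[8])         /* int32 accumulators for 8 output channels */
  {
    int32x4_t vacc0123 = vld1q_s32(acc);
    int32x4_t vacc4567 = vld1q_s32(acc + 4);

    /* The vxa1 pattern: uint8 -> uint16 widen, reinterpret as int16. */
    const int16x8_t vxa = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(a)));

    /* k = 0 */
    const int16x8_t vxb0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(b[0])));
    vacc0123 = vmlal_lane_s16(vacc0123, vget_low_s16(vxb0),  vget_low_s16(vxa), 0);
    vacc4567 = vmlal_lane_s16(vacc4567, vget_high_s16(vxb0), vget_low_s16(vxa), 0);

    /* k = 1 */
    const int16x8_t vxb1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(b[1])));
    vacc0123 = vmlal_lane_s16(vacc0123, vget_low_s16(vxb1),  vget_low_s16(vxa), 1);
    vacc4567 = vmlal_lane_s16(vacc4567, vget_high_s16(vxb1), vget_low_s16(vxa), 1);

    /* k = 2 */
    const int16x8_t vxb2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(b[2])));
    vacc0123 = vmlal_lane_s16(vacc0123, vget_low_s16(vxb2),  vget_low_s16(vxa), 2);
    vacc4567 = vmlal_lane_s16(vacc4567, vget_high_s16(vxb2), vget_low_s16(vxa), 2);

    /* k = 3; k = 4..7 would continue with vget_high_s16(vxa), lanes 0..3. */
    const int16x8_t vxb3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(b[3])));
    vacc0123 = vmlal_lane_s16(vacc0123, vget_low_s16(vxb3),  vget_low_s16(vxa), 3);
    vacc4567 = vmlal_lane_s16(vacc4567, vget_high_s16(vxb3), vget_low_s16(vxa), 3);

    vst1q_s32(acc,     vacc0123);
    vst1q_s32(acc + 4, vacc4567);
  }

The lane index of vmlal_lane_s16 must be a compile-time constant, which is why the kernels listed above are fully unrolled over k rather than looping with a variable lane.

|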
D | 4x4c2-sse2.c |
      87  const __m128i vxa1 = _mm_unpacklo_epi8(va1, vzero); in xnn_q8_igemm_ukernel_4x4c2__sse2() local
      99  …vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0,… in xnn_q8_igemm_ukernel_4x4c2__sse2()
     106  …vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1,… in xnn_q8_igemm_ukernel_4x4c2__sse2()
     113  …vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2,… in xnn_q8_igemm_ukernel_4x4c2__sse2()
     120  …vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3,… in xnn_q8_igemm_ukernel_4x4c2__sse2()
     132  const __m128i vxa1 = _mm_unpacklo_epi8(va1, vzero); in xnn_q8_igemm_ukernel_4x4c2__sse2() local
     143  …vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0,… in xnn_q8_igemm_ukernel_4x4c2__sse2()
     153  …vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1,… in xnn_q8_igemm_ukernel_4x4c2__sse2()
     163  …vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2,… in xnn_q8_igemm_ukernel_4x4c2__sse2()
     173  …vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3,… in xnn_q8_igemm_ukernel_4x4c2__sse2()
|
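4x4c2-sse2.c is the SSE2 counterpart of the same idea: _mm_unpacklo_epi8 zero-extends the activations to int16 (the vxa1 lines), _mm_shuffle_epi32 broadcasts one pair of k values to every 32-bit lane, and _mm_madd_epi16 multiplies that pair against a B block that, in the "c2" packing, interleaves 2 k values across 4 output channels, pairwise-adding straight into int32. A minimal single-row sketch, with hypothetical names and B assumed pre-widened to int16 (the real kernel widens B from uint8 and handles zero points elsewhere):

  #include <emmintrin.h>
  #include <stdint.h>

  /* Simplified single-row illustration (hypothetical names) of the "c2" pattern. */
  static __m128i q8_4c2_mac_sketch(
      const uint8_t a[8],     /* 8 consecutive k values from one row of A      */
      const int16_t b[4][8],  /* 4 B blocks, each 2 k values x 4 channels      */
      __m128i vacc)           /* 4 int32 accumulators, one per output channel  */
  {
    const __m128i vzero = _mm_setzero_si128();
    /* The vxa1 pattern: zero-extend uint8 activations to int16. */
    const __m128i vxa =
        _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*) a), vzero);

    /* k pair 0 (a0, a1) */
    vacc = _mm_add_epi32(vacc,
        _mm_madd_epi16(_mm_shuffle_epi32(vxa, _MM_SHUFFLE(0, 0, 0, 0)),
                       _mm_loadu_si128((const __m128i*) b[0])));
    /* k pair 1 (a2, a3) */
    vacc = _mm_add_epi32(vacc,
        _mm_madd_epi16(_mm_shuffle_epi32(vxa, _MM_SHUFFLE(1, 1, 1, 1)),
                       _mm_loadu_si128((const __m128i*) b[1])));
    /* k pair 2 (a4, a5) */
    vacc = _mm_add_epi32(vacc,
        _mm_madd_epi16(_mm_shuffle_epi32(vxa, _MM_SHUFFLE(2, 2, 2, 2)),
                       _mm_loadu_si128((const __m128i*) b[2])));
    /* k pair 3 (a6, a7) */
    vacc = _mm_add_epi32(vacc,
        _mm_madd_epi16(_mm_shuffle_epi32(vxa, _MM_SHUFFLE(3, 3, 3, 3)),
                       _mm_loadu_si128((const __m128i*) b[3])));
    return vacc;
  }

|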
/external/XNNPACK/src/q8-gemm/ |
D | 4x8-neon.c |
      72  const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1)); in xnn_q8_gemm_ukernel_4x8__neon() local
      83  vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_q8_gemm_ukernel_4x8__neon()
      84  vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_q8_gemm_ukernel_4x8__neon()
      95  vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1); in xnn_q8_gemm_ukernel_4x8__neon()
      96  vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1); in xnn_q8_gemm_ukernel_4x8__neon()
     107  vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2); in xnn_q8_gemm_ukernel_4x8__neon()
     108  vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2); in xnn_q8_gemm_ukernel_4x8__neon()
     119  vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3); in xnn_q8_gemm_ukernel_4x8__neon()
     120  vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3); in xnn_q8_gemm_ukernel_4x8__neon()
     131  vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0); in xnn_q8_gemm_ukernel_4x8__neon()
     [all …]
|
D | 8x8-neon.c |
     104  const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1)); a1 += 8; in xnn_q8_gemm_ukernel_8x8__neon() local
     123  vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_q8_gemm_ukernel_8x8__neon()
     124  vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_q8_gemm_ukernel_8x8__neon()
     143  vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1); in xnn_q8_gemm_ukernel_8x8__neon()
     144  vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1); in xnn_q8_gemm_ukernel_8x8__neon()
     163  vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2); in xnn_q8_gemm_ukernel_8x8__neon()
     164  vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2); in xnn_q8_gemm_ukernel_8x8__neon()
     183  vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3); in xnn_q8_gemm_ukernel_8x8__neon()
     184  vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3); in xnn_q8_gemm_ukernel_8x8__neon()
     203  vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0); in xnn_q8_gemm_ukernel_8x8__neon()
     [all …]
|
D | 4x4c2-sse2.c |
      70  const __m128i vxa1 = _mm_unpacklo_epi8(va1, vzero); in xnn_q8_gemm_ukernel_4x4c2__sse2() local
      85  _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_q8_gemm_ukernel_4x4c2__sse2()
      97  _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_q8_gemm_ukernel_4x4c2__sse2()
     109  _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_q8_gemm_ukernel_4x4c2__sse2()
     122  _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3)); in xnn_q8_gemm_ukernel_4x4c2__sse2()
     136  const __m128i vxa1 = _mm_unpacklo_epi8(va1, vzero); in xnn_q8_gemm_ukernel_4x4c2__sse2() local
     151  _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_q8_gemm_ukernel_4x4c2__sse2()
     165  _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_q8_gemm_ukernel_4x4c2__sse2()
     179  _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_q8_gemm_ukernel_4x4c2__sse2()
     193  _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3)); in xnn_q8_gemm_ukernel_4x4c2__sse2()
|
D | 2x4c8-sse2.c |
      81  const __m128i vxa1 = _mm_unpacklo_epi8(va1, vzero); in xnn_q8_gemm_ukernel_2x4c8__sse2() local
      98  vacc10 = _mm_add_epi32(vacc10, _mm_madd_epi16(vxa1, vxb0)); in xnn_q8_gemm_ukernel_2x4c8__sse2()
      99  vacc11 = _mm_add_epi32(vacc11, _mm_madd_epi16(vxa1, vxb1)); in xnn_q8_gemm_ukernel_2x4c8__sse2()
     100  vacc12 = _mm_add_epi32(vacc12, _mm_madd_epi16(vxa1, vxb2)); in xnn_q8_gemm_ukernel_2x4c8__sse2()
     101  vacc13 = _mm_add_epi32(vacc13, _mm_madd_epi16(vxa1, vxb3)); in xnn_q8_gemm_ukernel_2x4c8__sse2()
|
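2x4c8-sse2.c arranges B differently: each vxb vector carries 8 k values of a single output channel, so _mm_madd_epi16(vxa1, vxbN) leaves four int32 partial sums per channel that are reduced horizontally later. A hedged sketch of just that step, with hypothetical names, B assumed pre-widened to int16, and the final reduction omitted:

  #include <emmintrin.h>
  #include <stdint.h>

  /* Simplified illustration (hypothetical names) of the "c8" layout. */
  static void q8_2x4c8_mac_sketch(
      const uint8_t a[8],     /* 8 k values from one row of A                  */
      const int16_t b[4][8],  /* 4 output channels x 8 pre-widened k values    */
      __m128i vacc[4])        /* one partial-sum vector per output channel     */
  {
    const __m128i vzero = _mm_setzero_si128();
    /* The vxa1 pattern: zero-extend uint8 activations to int16. */
    const __m128i vxa =
        _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*) a), vzero);

    for (int n = 0; n < 4; n++) {
      const __m128i vxb = _mm_loadu_si128((const __m128i*) b[n]);
      /* 8 int16 products, pairwise-added into 4 int32 partial sums. */
      vacc[n] = _mm_add_epi32(vacc[n], _mm_madd_epi16(vxa, vxb));
    }
  }

|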
/external/XNNPACK/src/q8-vadd/ |
D | neon.c |
      41  const int16x8_t vxa1 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(va01), va_zero_point)); in xnn_q8_vadd_ukernel__neon() local
      50  int32x4_t vacc1_lo = vmulq_s32(vmovl_s16(vget_low_s16(vxa1)), va_multiplier); in xnn_q8_vadd_ukernel__neon()
      54  int32x4_t vacc1_hi = vmulq_s32(vmovl_high_s16(vxa1), va_multiplier); in xnn_q8_vadd_ukernel__neon()
     111  const int16x8_t vxa1 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(va01), va_zero_point)); in xnn_q8_vadd_ukernel__neon() local
     116  int32x4_t vacc1_lo = vmulq_s32(vmovl_s16(vget_low_s16(vxa1)), va_multiplier); in xnn_q8_vadd_ukernel__neon()
     118  int32x4_t vacc1_hi = vmulq_s32(vmovl_s16(vget_high_s16(vxa1)), va_multiplier); in xnn_q8_vadd_ukernel__neon()
|
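The q8-vadd kernel widens differently from the GEMM kernels: vsubl_u8 subtracts the zero point while promoting uint8 to int16, and each int16 half is then promoted to int32 and scaled by a per-input multiplier (vmovl_s16 / vmovl_high_s16 followed by vmulq_s32; vmovl_high_s16 is an AArch64-only intrinsic, which is presumably why both forms appear in the listing). A simplified sketch of that widening-and-scaling step for one operand, with hypothetical names; the second operand, rounding, and requantization back to uint8 are omitted:

  #include <arm_neon.h>
  #include <stdint.h>

  /* Simplified illustration (hypothetical names) of the q8-vadd widening step. */
  static void q8_vadd_widen_sketch(
      const uint8_t a[8],
      uint8_t a_zero_point,
      int32_t a_multiplier,
      int32_t out[8])
  {
    const uint8x8_t vzp   = vdup_n_u8(a_zero_point);
    const int32x4_t vmult = vdupq_n_s32(a_multiplier);

    /* The vxa1 pattern: widen and subtract the zero point in one instruction. */
    const int16x8_t vxa = vreinterpretq_s16_u16(vsubl_u8(vld1_u8(a), vzp));

    /* Widen each half to int32 and apply the per-input multiplier. */
    const int32x4_t vacc_lo = vmulq_s32(vmovl_s16(vget_low_s16(vxa)),  vmult);
    const int32x4_t vacc_hi = vmulq_s32(vmovl_s16(vget_high_s16(vxa)), vmult);

    vst1q_s32(out,     vacc_lo);
    vst1q_s32(out + 4, vacc_hi);
  }

|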