Searched refs:vxa1 (Results 1 – 8 of 8) sorted by relevance

/external/XNNPACK/src/q8-igemm/
4x8-neon.c
91 const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1)); in xnn_q8_igemm_ukernel_4x8__neon() local
101 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 0); in xnn_q8_igemm_ukernel_4x8__neon()
102 … vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 0); in xnn_q8_igemm_ukernel_4x8__neon()
115 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 1); in xnn_q8_igemm_ukernel_4x8__neon()
116 … vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 1); in xnn_q8_igemm_ukernel_4x8__neon()
129 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 2); in xnn_q8_igemm_ukernel_4x8__neon()
130 … vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 2); in xnn_q8_igemm_ukernel_4x8__neon()
143 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 3); in xnn_q8_igemm_ukernel_4x8__neon()
144 … vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 3); in xnn_q8_igemm_ukernel_4x8__neon()
157 … vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa1), 0); in xnn_q8_igemm_ukernel_4x8__neon()
[all …]
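
These matches all follow one pattern, which recurs in the 8x8 kernel below and in the q8-gemm results further down: the row of uint8 activations is zero-extended to int16 once (the vmovl_u8/vreinterpretq_s16_u16 pair at the `local` line), and each subsequent vmlal_lane_s16 broadcasts one int16 lane of it against the widened filter row, accumulating into int32; the kernels unroll this over lanes 0-3 of the low half and then the high half, which is why the matches step through lane indices. A minimal standalone sketch of one such step follows; the function name and signature are illustrative, not XNNPACK's.

#include <arm_neon.h>

/* Sketch of one lane step, assuming va1 holds 8 uint8 activations
 * and vxb01234567 a widened int16 filter row; hypothetical helper,
 * not an XNNPACK function. */
static inline void mac_step(int32x4_t *vacc0123, int32x4_t *vacc4567,
                            uint8x8_t va1, int16x8_t vxb01234567) {
  /* Zero-extend 8 x u8 to 8 x u16; the values fit in int16, so the
   * reinterpret to s16 is lossless. */
  const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1));
  /* Widening multiply-accumulate: lane 0 of the low half of vxa1 is
   * broadcast against each filter half, producing s32 accumulators. */
  *vacc0123 = vmlal_lane_s16(*vacc0123, vget_low_s16(vxb01234567),
                             vget_low_s16(vxa1), 0);
  *vacc4567 = vmlal_lane_s16(*vacc4567, vget_high_s16(vxb01234567),
                             vget_low_s16(vxa1), 0);
}
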
8x8-neon.c
135 const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1)); in xnn_q8_igemm_ukernel_8x8__neon() local
149 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 0); in xnn_q8_igemm_ukernel_8x8__neon()
150 … vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 0); in xnn_q8_igemm_ukernel_8x8__neon()
171 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 1); in xnn_q8_igemm_ukernel_8x8__neon()
172 … vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 1); in xnn_q8_igemm_ukernel_8x8__neon()
193 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 2); in xnn_q8_igemm_ukernel_8x8__neon()
194 … vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 2); in xnn_q8_igemm_ukernel_8x8__neon()
215 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 3); in xnn_q8_igemm_ukernel_8x8__neon()
216 … vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 3); in xnn_q8_igemm_ukernel_8x8__neon()
237 … vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa1), 0); in xnn_q8_igemm_ukernel_8x8__neon()
[all …]
4x4c2-sse2.c
87 const __m128i vxa1 = _mm_unpacklo_epi8(va1, vzero); in xnn_q8_igemm_ukernel_4x4c2__sse2() local
99 …vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0,… in xnn_q8_igemm_ukernel_4x4c2__sse2()
106 …vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1,… in xnn_q8_igemm_ukernel_4x4c2__sse2()
113 …vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2,… in xnn_q8_igemm_ukernel_4x4c2__sse2()
120 …vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3,… in xnn_q8_igemm_ukernel_4x4c2__sse2()
132 const __m128i vxa1 = _mm_unpacklo_epi8(va1, vzero); in xnn_q8_igemm_ukernel_4x4c2__sse2() local
143 …vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0,… in xnn_q8_igemm_ukernel_4x4c2__sse2()
153 …vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1,… in xnn_q8_igemm_ukernel_4x4c2__sse2()
163 …vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2,… in xnn_q8_igemm_ukernel_4x4c2__sse2()
173 …vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3,… in xnn_q8_igemm_ukernel_4x4c2__sse2()
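
The SSE2 matches (here and in the q8-gemm 4x4c2 kernel below) use the same widen-then-broadcast idea without a lane-MAC instruction: _mm_unpacklo_epi8 against zero widens u8 to s16, _mm_shuffle_epi32 broadcasts one 32-bit group (two adjacent s16 activations), and _mm_madd_epi16 yields pairwise-summed s32 products. A hedged sketch of group 0, with an illustrative function name:

#include <emmintrin.h>

/* Sketch of one c2 group step; names are illustrative. */
static inline __m128i mac_group0(__m128i vacc1x0123, __m128i va1,
                                 __m128i vxb0) {
  const __m128i vzero = _mm_setzero_si128();
  /* Interleave with zero: the low 8 u8 lanes become 8 s16 lanes. */
  const __m128i vxa1 = _mm_unpacklo_epi8(va1, vzero);
  /* Broadcast 32-bit group 0 (two s16 activations) to all four
   * positions, then madd: each s32 lane accumulates the pairwise
   * product sum against one pair of filter values. */
  return _mm_add_epi32(vacc1x0123,
      _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)),
                     vxb0));
}
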
/external/XNNPACK/src/q8-gemm/
4x8-neon.c
72 const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1)); in xnn_q8_gemm_ukernel_4x8__neon() local
83 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_q8_gemm_ukernel_4x8__neon()
84 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_q8_gemm_ukernel_4x8__neon()
95 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1); in xnn_q8_gemm_ukernel_4x8__neon()
96 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1); in xnn_q8_gemm_ukernel_4x8__neon()
107 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2); in xnn_q8_gemm_ukernel_4x8__neon()
108 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2); in xnn_q8_gemm_ukernel_4x8__neon()
119 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3); in xnn_q8_gemm_ukernel_4x8__neon()
120 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3); in xnn_q8_gemm_ukernel_4x8__neon()
131 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0); in xnn_q8_gemm_ukernel_4x8__neon()
[all …]
8x8-neon.c
104 const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1)); a1 += 8; in xnn_q8_gemm_ukernel_8x8__neon() local
123 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_q8_gemm_ukernel_8x8__neon()
124 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_q8_gemm_ukernel_8x8__neon()
143 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1); in xnn_q8_gemm_ukernel_8x8__neon()
144 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1); in xnn_q8_gemm_ukernel_8x8__neon()
163 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2); in xnn_q8_gemm_ukernel_8x8__neon()
164 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2); in xnn_q8_gemm_ukernel_8x8__neon()
183 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3); in xnn_q8_gemm_ukernel_8x8__neon()
184 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3); in xnn_q8_gemm_ukernel_8x8__neon()
203 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0); in xnn_q8_gemm_ukernel_8x8__neon()
[all …]
4x4c2-sse2.c
70 const __m128i vxa1 = _mm_unpacklo_epi8(va1, vzero); in xnn_q8_gemm_ukernel_4x4c2__sse2() local
85 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_q8_gemm_ukernel_4x4c2__sse2()
97 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_q8_gemm_ukernel_4x4c2__sse2()
109 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_q8_gemm_ukernel_4x4c2__sse2()
122 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3)); in xnn_q8_gemm_ukernel_4x4c2__sse2()
136 const __m128i vxa1 = _mm_unpacklo_epi8(va1, vzero); in xnn_q8_gemm_ukernel_4x4c2__sse2() local
151 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0)); in xnn_q8_gemm_ukernel_4x4c2__sse2()
165 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1)); in xnn_q8_gemm_ukernel_4x4c2__sse2()
179 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2)); in xnn_q8_gemm_ukernel_4x4c2__sse2()
193 _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3)); in xnn_q8_gemm_ukernel_4x4c2__sse2()
2x4c8-sse2.c
81 const __m128i vxa1 = _mm_unpacklo_epi8(va1, vzero); in xnn_q8_gemm_ukernel_2x4c8__sse2() local
98 vacc10 = _mm_add_epi32(vacc10, _mm_madd_epi16(vxa1, vxb0)); in xnn_q8_gemm_ukernel_2x4c8__sse2()
99 vacc11 = _mm_add_epi32(vacc11, _mm_madd_epi16(vxa1, vxb1)); in xnn_q8_gemm_ukernel_2x4c8__sse2()
100 vacc12 = _mm_add_epi32(vacc12, _mm_madd_epi16(vxa1, vxb2)); in xnn_q8_gemm_ukernel_2x4c8__sse2()
101 vacc13 = _mm_add_epi32(vacc13, _mm_madd_epi16(vxa1, vxb3)); in xnn_q8_gemm_ukernel_2x4c8__sse2()
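
The 2x4c8 variant consumes 8 input channels per step, so no shuffle/broadcast is needed: the whole widened activation vector is madd'ed directly against each widened filter column, leaving four per-column accumulators (vacc10..vacc13) that are presumably reduced horizontally after the loop. A sketch under the same assumptions, with illustrative names:

#include <emmintrin.h>

/* Sketch of one c8 step for row 1 against two filter columns. */
static inline void mac_c8(__m128i *vacc10, __m128i *vacc11,
                          __m128i vxa1, __m128i vxb0, __m128i vxb1) {
  /* madd: 8 s16 products per column, pairwise-summed into 4 s32. */
  *vacc10 = _mm_add_epi32(*vacc10, _mm_madd_epi16(vxa1, vxb0));
  *vacc11 = _mm_add_epi32(*vacc11, _mm_madd_epi16(vxa1, vxb1));
}
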
/external/XNNPACK/src/q8-vadd/
neon.c
41 const int16x8_t vxa1 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(va01), va_zero_point)); in xnn_q8_vadd_ukernel__neon() local
50 int32x4_t vacc1_lo = vmulq_s32(vmovl_s16(vget_low_s16(vxa1)), va_multiplier); in xnn_q8_vadd_ukernel__neon()
54 int32x4_t vacc1_hi = vmulq_s32(vmovl_high_s16(vxa1), va_multiplier); in xnn_q8_vadd_ukernel__neon()
111 const int16x8_t vxa1 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(va01), va_zero_point)); in xnn_q8_vadd_ukernel__neon() local
116 int32x4_t vacc1_lo = vmulq_s32(vmovl_s16(vget_low_s16(vxa1)), va_multiplier); in xnn_q8_vadd_ukernel__neon()
118 int32x4_t vacc1_hi = vmulq_s32(vmovl_s16(vget_high_s16(vxa1)), va_multiplier); in xnn_q8_vadd_ukernel__neon()
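
Unlike the GEMM kernels, the vadd kernel subtracts the input zero point during widening: vsubl_u8 computes u8 differences widened to u16, which reinterpret safely to s16 since the result fits in 9 bits; a second widening to s32 then feeds the fixed-point input multiplier. The two high-half spellings above differ because vmovl_high_s16 (line 54) is an AArch64-only intrinsic, while vmovl_s16(vget_high_s16(...)) (line 118) is the portable form. A sketch of the low-half step, with illustrative names:

#include <arm_neon.h>

/* Sketch of the widen-and-scale step for the high 8 bytes of a
 * 16-byte activation load; hypothetical helper, not XNNPACK's. */
static inline int32x4_t widen_scale_lo(uint8x16_t va01,
                                       uint8x8_t va_zero_point,
                                       int32x4_t va_multiplier) {
  /* Subtract the zero point while widening u8 -> u16; the difference
   * fits in 9 bits, so reinterpreting as s16 preserves the value. */
  const int16x8_t vxa1 =
      vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(va01), va_zero_point));
  /* Widen the low four lanes to s32 and apply the input multiplier. */
  return vmulq_s32(vmovl_s16(vget_low_s16(vxa1)), va_multiplier);
}
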