Home
last modified time | relevance | path

Searched refs:vout_lo (Results 1 – 22 of 22) sorted by relevance

/external/XNNPACK/src/u8-maxpool/
D9p8x-minmax-neon-c16.c124 uint8x8_t vout_lo = vget_low_u8(vout); in xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16() local
126 vst1_u8(o, vout_lo); o += 8; in xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16()
127 vout_lo = vget_high_u8(vout); in xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16()
130 vst1_lane_u32(__builtin_assume_aligned(o, 1), vreinterpret_u32_u8(vout_lo), 0); o += 4; in xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16()
131 vout_lo = vext_u8(vout_lo, vout_lo, 4); in xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16()
134 vst1_lane_u16(__builtin_assume_aligned(o, 1), vreinterpret_u16_u8(vout_lo), 0); o += 2; in xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16()
135 vout_lo = vext_u8(vout_lo, vout_lo, 2); in xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16()
138 vst1_lane_u8(o, vout_lo, 0); o += 1; in xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16()
228 uint8x8_t vout_lo = vget_low_u8(vout); in xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16() local
230 vst1_u8(o, vout_lo); o += 8; in xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16()
[all …]
/external/XNNPACK/src/f16-gavgpool/
D7x-minmax-neonfp16arith-c8.c104 float16x4_t vout_lo = vget_low_f16(vout); in xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8() local
106 vst1_f16(output, vout_lo); output += 4; in xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8()
107 vout_lo = vget_high_f16(vout); in xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8()
110 … vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vout_lo), 0); output += 2; in xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8()
111 vout_lo = vext_f16(vout_lo, vout_lo, 2); in xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8()
114 vst1_lane_f16(output, vout_lo, 0); in xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8()
D7p7x-minmax-neonfp16arith-c8.c176 float16x4_t vout_lo = vget_low_f16(vout); in xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8() local
178 vst1_f16(output, vout_lo); output += 4; in xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8()
179 vout_lo = vget_high_f16(vout); in xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8()
182 … vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vout_lo), 0); output += 2; in xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8()
183 vout_lo = vext_f16(vout_lo, vout_lo, 2); in xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8()
186 vst1_lane_f16(output, vout_lo, 0); in xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8()
/external/XNNPACK/src/qs8-gemm/gen/
D1x8c8-xw-minmax-avx2.c123 __m128i vout_lo = _mm256_castsi256_si128(vout); in xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2() local
127 _mm_storel_epi64((__m128i*) c0, vout_lo); in xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2()
136 _mm_storeu_si32(c0, vout_lo); in xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2()
140 vout_lo = _mm_srli_epi64(vout_lo, 32); in xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2()
144 *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0); in xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2()
148 vout_lo = _mm_srli_epi32(vout_lo, 16); in xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2()
152 *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0); in xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2()
D1x8c8-minmax-avx2.c127 __m128i vout_lo = _mm256_castsi256_si128(vout); in xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2() local
131 _mm_storel_epi64((__m128i*) c0, vout_lo); in xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2()
140 _mm_storeu_si32(c0, vout_lo); in xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2()
144 vout_lo = _mm_srli_epi64(vout_lo, 32); in xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2()
148 *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0); in xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2()
152 vout_lo = _mm_srli_epi32(vout_lo, 16); in xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2()
156 *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0); in xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2()
D3x8c8-minmax-avx2.c192 __m128i vout_lo = _mm256_castsi256_si128(vout); in xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2() local
196 _mm_storel_epi64((__m128i*) c0, vout_lo); in xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2()
198 _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo)); in xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2()
211 _mm_storeu_si32(c0, vout_lo); in xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2()
213 *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2); in xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2()
219 vout_lo = _mm_srli_epi64(vout_lo, 32); in xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2()
223 *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0); in xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2()
225 *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4); in xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2()
231 vout_lo = _mm_srli_epi32(vout_lo, 16); in xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2()
235 *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0); in xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2()
[all …]
D3x8c8-xw-minmax-avx2.c188 __m128i vout_lo = _mm256_castsi256_si128(vout); in xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2() local
192 _mm_storel_epi64((__m128i*) c0, vout_lo); in xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2()
194 _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo)); in xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2()
207 _mm_storeu_si32(c0, vout_lo); in xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2()
209 *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2); in xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2()
215 vout_lo = _mm_srli_epi64(vout_lo, 32); in xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2()
219 *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0); in xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2()
221 *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4); in xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2()
227 vout_lo = _mm_srli_epi32(vout_lo, 16); in xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2()
231 *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0); in xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2()
[all …]
D2x8c8-xw-minmax-avx2.c154 __m128i vout_lo = _mm256_castsi256_si128(vout); in xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2() local
158 _mm_storel_epi64((__m128i*) c0, vout_lo); in xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2()
170 _mm_storeu_si32(c0, vout_lo); in xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2()
176 vout_lo = _mm_srli_epi64(vout_lo, 32); in xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2()
180 *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0); in xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2()
186 vout_lo = _mm_srli_epi32(vout_lo, 16); in xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2()
190 *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0); in xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2()
D2x8c8-minmax-avx2.c158 __m128i vout_lo = _mm256_castsi256_si128(vout); in xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2() local
162 _mm_storel_epi64((__m128i*) c0, vout_lo); in xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2()
174 _mm_storeu_si32(c0, vout_lo); in xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2()
180 vout_lo = _mm_srli_epi64(vout_lo, 32); in xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2()
184 *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0); in xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2()
190 vout_lo = _mm_srli_epi32(vout_lo, 16); in xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2()
194 *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0); in xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2()
/external/XNNPACK/src/qs8-igemm/gen/
D3x8c8-minmax-avx2.c209 __m128i vout_lo = _mm256_castsi256_si128(vout); in xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2() local
213 _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo)); in xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2()
215 _mm_storel_epi64((__m128i*) c0, vout_lo); in xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2()
226 *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2); in xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2()
228 _mm_storeu_si32(c0, vout_lo); in xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2()
234 vout_lo = _mm_srli_epi64(vout_lo, 32); in xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2()
238 *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4); in xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2()
240 *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0); in xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2()
246 vout_lo = _mm_srli_epi32(vout_lo, 16); in xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2()
250 *c2 = (uint8_t) _mm_extract_epi8(vout_lo, 8); in xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2()
[all …]
D1x8c8-minmax-avx2.c140 __m128i vout_lo = _mm256_castsi256_si128(vout); in xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2() local
144 _mm_storel_epi64((__m128i*) c0, vout_lo); in xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2()
153 _mm_storeu_si32(c0, vout_lo); in xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2()
157 vout_lo = _mm_srli_epi64(vout_lo, 32); in xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2()
161 *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0); in xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2()
165 vout_lo = _mm_srli_epi32(vout_lo, 16); in xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2()
169 *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0); in xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2()
D2x8c8-minmax-avx2.c173 __m128i vout_lo = _mm256_castsi256_si128(vout); in xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2() local
178 _mm_storel_epi64((__m128i*) c0, vout_lo); in xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2()
189 _mm_storeu_si32(c0, vout_lo); in xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2()
194 vout_lo = _mm_srli_epi64(vout_lo, 32); in xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2()
199 *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0); in xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2()
204 vout_lo = _mm_srli_epi32(vout_lo, 16); in xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2()
209 *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0); in xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2()
/external/XNNPACK/src/qs8-igemm/
DMRx8c8-avx2.c.in160 __m128i vout_lo = _mm256_castsi256_si128(vout); local
167 _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
170 _mm_storel_epi64((__m128i*) c0, vout_lo);
183 *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);
186 _mm_storeu_si32(c0, vout_lo);
191 vout_lo = _mm_srli_epi64(vout_lo, 32);
198 *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
201 *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
206 vout_lo = _mm_srli_epi32(vout_lo, 16);
213 *c2 = (uint8_t) _mm_extract_epi8(vout_lo, 8);
[all …]
/external/XNNPACK/src/qs8-gemm/
DMRx8c8-avx2.c.in161 __m128i vout_lo = _mm256_castsi256_si128(vout);
165 _mm_storel_epi64((__m128i*) c0, vout_lo);
169 _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
182 _mm_storeu_si32(c0, vout_lo);
186 *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);
193 vout_lo = _mm_srli_epi64(vout_lo, 32);
197 *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
201 *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
208 vout_lo = _mm_srli_epi32(vout_lo, 16);
212 *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
[all …]
/external/XNNPACK/src/f32-maxpool/
D9p8x-minmax-neon-c4.c140 float32x2_t vout_lo = vget_low_f32(vout); in xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4() local
142 vst1_f32(o, vout_lo); in xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4()
143 vout_lo = vget_high_f32(vout); in xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4()
147 vst1_lane_f32(o, vout_lo, 0); in xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4()
247 float32x2_t vout_lo = vget_low_f32(vout); in xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4() local
249 vst1_f32(o, vout_lo); in xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4()
250 vout_lo = vget_high_f32(vout); in xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4()
254 vst1_lane_f32(o, vout_lo, 0); in xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4()
/external/XNNPACK/src/f32-gavgpool/
D7x-minmax-neon-c4.c103 float32x2_t vout_lo = vget_low_f32(vout); in xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4() local
105 vst1_f32(output, vout_lo); output += 2; in xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4()
106 vout_lo = vget_high_f32(vout); in xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4()
109 vst1_lane_f32(output, vout_lo, 0); in xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4()
D7p7x-minmax-neon-c4.c175 float32x2_t vout_lo = vget_low_f32(vout); in xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4() local
177 vst1_f32(output, vout_lo); output += 2; in xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4()
178 vout_lo = vget_high_f32(vout); in xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4()
181 vst1_lane_f32(output, vout_lo, 0); in xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4()
/external/XNNPACK/src/f32-avgpool/
D9x-minmax-neon-c4.c159 float32x2_t vout_lo = vget_low_f32(vout); in xnn_f32_avgpool_minmax_ukernel_9x__neon_c4() local
161 vst1_f32(output, vout_lo); output += 2; in xnn_f32_avgpool_minmax_ukernel_9x__neon_c4()
162 vout_lo = vget_high_f32(vout); in xnn_f32_avgpool_minmax_ukernel_9x__neon_c4()
165 vst1_lane_f32(output, vout_lo, 0); output += 1; in xnn_f32_avgpool_minmax_ukernel_9x__neon_c4()
D9p8x-minmax-neon-c4.c294 float32x2_t vout_lo = vget_low_f32(vout); in xnn_f32_avgpool_minmax_ukernel_9p8x__neon_c4() local
296 vst1_f32(output, vout_lo); output += 2; in xnn_f32_avgpool_minmax_ukernel_9p8x__neon_c4()
297 vout_lo = vget_high_f32(vout); in xnn_f32_avgpool_minmax_ukernel_9p8x__neon_c4()
300 vst1_lane_f32(output, vout_lo, 0); output += 1; in xnn_f32_avgpool_minmax_ukernel_9p8x__neon_c4()
/external/XNNPACK/src/f32-pavgpool/
D9x-minmax-neon-c4.c161 float32x2_t vout_lo = vget_low_f32(vout); in xnn_f32_pavgpool_minmax_ukernel_9x__neon_c4() local
163 vst1_f32(output, vout_lo); output += 2; in xnn_f32_pavgpool_minmax_ukernel_9x__neon_c4()
164 vout_lo = vget_high_f32(vout); in xnn_f32_pavgpool_minmax_ukernel_9x__neon_c4()
167 vst1_lane_f32(output, vout_lo, 0); output += 1; in xnn_f32_pavgpool_minmax_ukernel_9x__neon_c4()
D9p8x-minmax-neon-c4.c295 float32x2_t vout_lo = vget_low_f32(vout); in xnn_f32_pavgpool_minmax_ukernel_9p8x__neon_c4() local
297 vst1_f32(output, vout_lo); output += 2; in xnn_f32_pavgpool_minmax_ukernel_9p8x__neon_c4()
298 vout_lo = vget_high_f32(vout); in xnn_f32_pavgpool_minmax_ukernel_9p8x__neon_c4()
301 vst1_lane_f32(output, vout_lo, 0); output += 1; in xnn_f32_pavgpool_minmax_ukernel_9p8x__neon_c4()
/external/XNNPACK/src/qu8-dwconv/
Dup8x9-minmax-sse2.c212 …const __m128i vout_lo = _mm_sub_epi32(_mm_sra_epi32(vq31prod_lo0123, vshift), _mm_cmpgt_epi32(vrem… in xnn_qu8_dwconv_minmax_ukernel_up8x9__sse2() local
216 __m128i vout = _mm_adds_epi16(_mm_packs_epi32(vout_lo, vout_hi), voutput_zero_point); in xnn_qu8_dwconv_minmax_ukernel_up8x9__sse2()
362 …const __m128i vout_lo = _mm_sub_epi32(_mm_sra_epi32(vq31prod_lo0123, vshift), _mm_cmpgt_epi32(vrem… in xnn_qu8_dwconv_minmax_ukernel_up8x9__sse2() local
366 __m128i vout = _mm_adds_epi16(_mm_packs_epi32(vout_lo, vout_hi), voutput_zero_point); in xnn_qu8_dwconv_minmax_ukernel_up8x9__sse2()