/external/XNNPACK/src/u8-maxpool/
D | 9p8x-minmax-neon-c16.c | in xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16():
    124  uint8x8_t vout_lo = vget_low_u8(vout);  (local)
    126  vst1_u8(o, vout_lo); o += 8;
    127  vout_lo = vget_high_u8(vout);
    130  vst1_lane_u32(__builtin_assume_aligned(o, 1), vreinterpret_u32_u8(vout_lo), 0); o += 4;
    131  vout_lo = vext_u8(vout_lo, vout_lo, 4);
    134  vst1_lane_u16(__builtin_assume_aligned(o, 1), vreinterpret_u16_u8(vout_lo), 0); o += 2;
    135  vout_lo = vext_u8(vout_lo, vout_lo, 2);
    138  vst1_lane_u8(o, vout_lo, 0); o += 1;
    228  uint8x8_t vout_lo = vget_low_u8(vout);  (local)
    230  vst1_u8(o, vout_lo); o += 8;
    [all …]
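The lines above are the channel-remainder store of the u8 maxpool kernel: the low half of a 16-byte result is written in 8/4/2/1-byte steps, rotating the next bytes into lane 0 with vext_u8 after each partial store. A minimal, self-contained sketch of that pattern, assuming a hypothetical helper name and a remainder count c < 16:

    #include <arm_neon.h>
    #include <stddef.h>
    #include <stdint.h>

    // Sketch of the c < 16 remainder store quoted above (helper name is hypothetical).
    static void store_u8_tail(uint8_t* o, uint8x16_t vout, size_t c) {
      uint8x8_t vout_lo = vget_low_u8(vout);
      if (c & 8) {
        vst1_u8(o, vout_lo); o += 8;             // write 8 bytes, continue with the high half
        vout_lo = vget_high_u8(vout);
      }
      if (c & 4) {
        vst1_lane_u32(__builtin_assume_aligned(o, 1), vreinterpret_u32_u8(vout_lo), 0); o += 4;
        vout_lo = vext_u8(vout_lo, vout_lo, 4);  // rotate the next 4 bytes into lane 0
      }
      if (c & 2) {
        vst1_lane_u16(__builtin_assume_aligned(o, 1), vreinterpret_u16_u8(vout_lo), 0); o += 2;
        vout_lo = vext_u8(vout_lo, vout_lo, 2);
      }
      if (c & 1) {
        vst1_lane_u8(o, vout_lo, 0);             // final single byte
      }
    }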
/external/XNNPACK/src/f16-gavgpool/
D | 7x-minmax-neonfp16arith-c8.c | in xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8():
    104  float16x4_t vout_lo = vget_low_f16(vout);  (local)
    106  vst1_f16(output, vout_lo); output += 4;
    107  vout_lo = vget_high_f16(vout);
    110  … vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vout_lo), 0); output += 2;
    111  vout_lo = vext_f16(vout_lo, vout_lo, 2);
    114  vst1_lane_f16(output, vout_lo, 0);
D | 7p7x-minmax-neonfp16arith-c8.c | in xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8():
    176  float16x4_t vout_lo = vget_low_f16(vout);  (local)
    178  vst1_f16(output, vout_lo); output += 4;
    179  vout_lo = vget_high_f16(vout);
    182  … vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vout_lo), 0); output += 2;
    183  vout_lo = vext_f16(vout_lo, vout_lo, 2);
    186  vst1_lane_f16(output, vout_lo, 0);
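The fp16 global-average-pooling kernels apply the same cascade to a float16x8_t: four halves, then two via a 32-bit lane store of the reinterpreted vector, then one. A sketch assuming a target with ARMv8.2-A FP16 support (which the neonfp16arith kernels require); the helper name is hypothetical:

    #include <arm_neon.h>   // requires ARMv8.2-A FP16 (neonfp16arith) support
    #include <stddef.h>

    // Sketch of the channels < 8 remainder store quoted above (hypothetical helper).
    static void store_f16_tail(float16_t* output, float16x8_t vout, size_t channels) {
      float16x4_t vout_lo = vget_low_f16(vout);
      if (channels & 4) {
        vst1_f16(output, vout_lo); output += 4;
        vout_lo = vget_high_f16(vout);
      }
      if (channels & 2) {
        // Two fp16 lanes are stored as one 32-bit lane of the reinterpreted vector.
        vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vout_lo), 0);
        output += 2;
        vout_lo = vext_f16(vout_lo, vout_lo, 2);
      }
      if (channels & 1) {
        vst1_lane_f16(output, vout_lo, 0);
      }
    }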
/external/XNNPACK/src/qs8-gemm/gen/
D | 1x8c8-xw-minmax-avx2.c | in xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2():
    123  __m128i vout_lo = _mm256_castsi256_si128(vout);  (local)
    127  _mm_storel_epi64((__m128i*) c0, vout_lo);
    136  _mm_storeu_si32(c0, vout_lo);
    140  vout_lo = _mm_srli_epi64(vout_lo, 32);
    144  *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
    148  vout_lo = _mm_srli_epi32(vout_lo, 16);
    152  *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
D | 1x8c8-minmax-avx2.c | in xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2():
    127  __m128i vout_lo = _mm256_castsi256_si128(vout);  (local)
    131  _mm_storel_epi64((__m128i*) c0, vout_lo);
    140  _mm_storeu_si32(c0, vout_lo);
    144  vout_lo = _mm_srli_epi64(vout_lo, 32);
    148  *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
    152  vout_lo = _mm_srli_epi32(vout_lo, 16);
    156  *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
D | 3x8c8-minmax-avx2.c | in xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2():
    192  __m128i vout_lo = _mm256_castsi256_si128(vout);  (local)
    196  _mm_storel_epi64((__m128i*) c0, vout_lo);
    198  _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
    211  _mm_storeu_si32(c0, vout_lo);
    213  *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);
    219  vout_lo = _mm_srli_epi64(vout_lo, 32);
    223  *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
    225  *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
    231  vout_lo = _mm_srli_epi32(vout_lo, 16);
    235  *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
    [all …]
D | 3x8c8-xw-minmax-avx2.c | in xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2():
    188  __m128i vout_lo = _mm256_castsi256_si128(vout);  (local)
    192  _mm_storel_epi64((__m128i*) c0, vout_lo);
    194  _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
    207  _mm_storeu_si32(c0, vout_lo);
    209  *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);
    215  vout_lo = _mm_srli_epi64(vout_lo, 32);
    219  *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
    221  *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
    227  vout_lo = _mm_srli_epi32(vout_lo, 16);
    231  *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
    [all …]
D | 2x8c8-xw-minmax-avx2.c | in xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2():
    154  __m128i vout_lo = _mm256_castsi256_si128(vout);  (local)
    158  _mm_storel_epi64((__m128i*) c0, vout_lo);
    170  _mm_storeu_si32(c0, vout_lo);
    176  vout_lo = _mm_srli_epi64(vout_lo, 32);
    180  *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
    186  vout_lo = _mm_srli_epi32(vout_lo, 16);
    190  *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
D | 2x8c8-minmax-avx2.c | in xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2():
    158  __m128i vout_lo = _mm256_castsi256_si128(vout);  (local)
    162  _mm_storel_epi64((__m128i*) c0, vout_lo);
    174  _mm_storeu_si32(c0, vout_lo);
    180  vout_lo = _mm_srli_epi64(vout_lo, 32);
    184  *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
    190  vout_lo = _mm_srli_epi32(vout_lo, 16);
    194  *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
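In these AVX2 GEMM kernels, vout_lo is the low 128-bit lane of the packed int8 result: row 0 (c0) occupies bytes 0..7 and, in the 3x variants, row 2 (c2) occupies bytes 8..15, while row 1 is carried in the upper lane and handled the same way. The full-width path writes 8 bytes per row with _mm_storel_epi64/_mm_storeh_pi; the nc < 8 remainder is written 4, 2 and 1 bytes at a time, shifting the lane between steps. A minimal sketch of that remainder path for the two rows held in vout_lo (the helper name is hypothetical):

    #include <immintrin.h>
    #include <stddef.h>
    #include <stdint.h>

    // Sketch of the nc < 8 remainder store quoted above for the two rows kept in
    // the low 128-bit lane (c0 in bytes 0..7, c2 in bytes 8..15). Hypothetical helper.
    static void store_qs8_rows_tail(int8_t* c0, int8_t* c2, __m128i vout_lo, size_t nc) {
      if (nc & 4) {
        _mm_storeu_si32(c0, vout_lo);                                   // row 0: bytes 0..3
        *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);   // row 2: bytes 8..11
        c0 += 4; c2 += 4;
        vout_lo = _mm_srli_epi64(vout_lo, 32);   // slide the rest of each row down
      }
      if (nc & 2) {
        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
        c0 += 2; c2 += 2;
        vout_lo = _mm_srli_epi32(vout_lo, 16);
      }
      if (nc & 1) {
        *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
        *c2 = (int8_t) _mm_extract_epi8(vout_lo, 8);
      }
    }

The 1x and 2x kernels use only the c0 branch of this sequence; the igemm kernels and the MRx8c8-avx2.c.in templates below follow the same pattern with the row order reversed.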
/external/XNNPACK/src/qs8-igemm/gen/
D | 3x8c8-minmax-avx2.c | in xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2():
    209  __m128i vout_lo = _mm256_castsi256_si128(vout);  (local)
    213  _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
    215  _mm_storel_epi64((__m128i*) c0, vout_lo);
    226  *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);
    228  _mm_storeu_si32(c0, vout_lo);
    234  vout_lo = _mm_srli_epi64(vout_lo, 32);
    238  *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
    240  *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
    246  vout_lo = _mm_srli_epi32(vout_lo, 16);
    250  *c2 = (uint8_t) _mm_extract_epi8(vout_lo, 8);
    [all …]
D | 1x8c8-minmax-avx2.c | in xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2():
    140  __m128i vout_lo = _mm256_castsi256_si128(vout);  (local)
    144  _mm_storel_epi64((__m128i*) c0, vout_lo);
    153  _mm_storeu_si32(c0, vout_lo);
    157  vout_lo = _mm_srli_epi64(vout_lo, 32);
    161  *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
    165  vout_lo = _mm_srli_epi32(vout_lo, 16);
    169  *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
D | 2x8c8-minmax-avx2.c | in xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2():
    173  __m128i vout_lo = _mm256_castsi256_si128(vout);  (local)
    178  _mm_storel_epi64((__m128i*) c0, vout_lo);
    189  _mm_storeu_si32(c0, vout_lo);
    194  vout_lo = _mm_srli_epi64(vout_lo, 32);
    199  *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
    204  vout_lo = _mm_srli_epi32(vout_lo, 16);
    209  *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
/external/XNNPACK/src/qs8-igemm/
D | MRx8c8-avx2.c.in |
    160  __m128i vout_lo = _mm256_castsi256_si128(vout);  (local)
    167  _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
    170  _mm_storel_epi64((__m128i*) c0, vout_lo);
    183  *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);
    186  _mm_storeu_si32(c0, vout_lo);
    191  vout_lo = _mm_srli_epi64(vout_lo, 32);
    198  *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
    201  *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
    206  vout_lo = _mm_srli_epi32(vout_lo, 16);
    213  *c2 = (uint8_t) _mm_extract_epi8(vout_lo, 8);
    [all …]
/external/XNNPACK/src/qs8-gemm/
D | MRx8c8-avx2.c.in |
    161  __m128i vout_lo = _mm256_castsi256_si128(vout);
    165  _mm_storel_epi64((__m128i*) c0, vout_lo);
    169  _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
    182  _mm_storeu_si32(c0, vout_lo);
    186  *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);
    193  vout_lo = _mm_srli_epi64(vout_lo, 32);
    197  *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
    201  *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
    208  vout_lo = _mm_srli_epi32(vout_lo, 16);
    212  *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
    [all …]
/external/XNNPACK/src/f32-maxpool/
D | 9p8x-minmax-neon-c4.c | in xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4():
    140  float32x2_t vout_lo = vget_low_f32(vout);  (local)
    142  vst1_f32(o, vout_lo);
    143  vout_lo = vget_high_f32(vout);
    147  vst1_lane_f32(o, vout_lo, 0);
    247  float32x2_t vout_lo = vget_low_f32(vout);  (local)
    249  vst1_f32(o, vout_lo);
    250  vout_lo = vget_high_f32(vout);
    254  vst1_lane_f32(o, vout_lo, 0);
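The f32 NEON pooling kernels use the same remainder idea for a 4-float vector: if two channels remain, store the low half and switch to the high half; if one remains, store lane 0. A minimal sketch (hypothetical helper, remainder count c < 4):

    #include <arm_neon.h>
    #include <stddef.h>

    // Sketch of the c < 4 remainder store used by the f32 NEON pooling kernels
    // quoted above and below (helper name is hypothetical).
    static void store_f32_tail(float* o, float32x4_t vout, size_t c) {
      float32x2_t vout_lo = vget_low_f32(vout);
      if (c & 2) {
        vst1_f32(o, vout_lo); o += 2;    // write two floats, continue with the high half
        vout_lo = vget_high_f32(vout);
      }
      if (c & 1) {
        vst1_lane_f32(o, vout_lo, 0);    // write the last remaining float
      }
    }

The gavgpool, avgpool and pavgpool entries that follow are instances of exactly this sequence with an output pointer named `output`.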
/external/XNNPACK/src/f32-gavgpool/
D | 7x-minmax-neon-c4.c | in xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4():
    103  float32x2_t vout_lo = vget_low_f32(vout);  (local)
    105  vst1_f32(output, vout_lo); output += 2;
    106  vout_lo = vget_high_f32(vout);
    109  vst1_lane_f32(output, vout_lo, 0);
D | 7p7x-minmax-neon-c4.c | in xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4():
    175  float32x2_t vout_lo = vget_low_f32(vout);  (local)
    177  vst1_f32(output, vout_lo); output += 2;
    178  vout_lo = vget_high_f32(vout);
    181  vst1_lane_f32(output, vout_lo, 0);
/external/XNNPACK/src/f32-avgpool/
D | 9x-minmax-neon-c4.c | in xnn_f32_avgpool_minmax_ukernel_9x__neon_c4():
    159  float32x2_t vout_lo = vget_low_f32(vout);  (local)
    161  vst1_f32(output, vout_lo); output += 2;
    162  vout_lo = vget_high_f32(vout);
    165  vst1_lane_f32(output, vout_lo, 0); output += 1;
D | 9p8x-minmax-neon-c4.c | in xnn_f32_avgpool_minmax_ukernel_9p8x__neon_c4():
    294  float32x2_t vout_lo = vget_low_f32(vout);  (local)
    296  vst1_f32(output, vout_lo); output += 2;
    297  vout_lo = vget_high_f32(vout);
    300  vst1_lane_f32(output, vout_lo, 0); output += 1;
/external/XNNPACK/src/f32-pavgpool/
D | 9x-minmax-neon-c4.c | in xnn_f32_pavgpool_minmax_ukernel_9x__neon_c4():
    161  float32x2_t vout_lo = vget_low_f32(vout);  (local)
    163  vst1_f32(output, vout_lo); output += 2;
    164  vout_lo = vget_high_f32(vout);
    167  vst1_lane_f32(output, vout_lo, 0); output += 1;
D | 9p8x-minmax-neon-c4.c | in xnn_f32_pavgpool_minmax_ukernel_9p8x__neon_c4():
    295  float32x2_t vout_lo = vget_low_f32(vout);  (local)
    297  vst1_f32(output, vout_lo); output += 2;
    298  vout_lo = vget_high_f32(vout);
    301  vst1_lane_f32(output, vout_lo, 0); output += 1;
/external/XNNPACK/src/qu8-dwconv/
D | up8x9-minmax-sse2.c | in xnn_qu8_dwconv_minmax_ukernel_up8x9__sse2():
    212  const __m128i vout_lo = _mm_sub_epi32(_mm_sra_epi32(vq31prod_lo0123, vshift), _mm_cmpgt_epi32(vrem…  (local)
    216  __m128i vout = _mm_adds_epi16(_mm_packs_epi32(vout_lo, vout_hi), voutput_zero_point);
    362  const __m128i vout_lo = _mm_sub_epi32(_mm_sra_epi32(vq31prod_lo0123, vshift), _mm_cmpgt_epi32(vrem…  (local)
    366  __m128i vout = _mm_adds_epi16(_mm_packs_epi32(vout_lo, vout_hi), voutput_zero_point);
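Here vout_lo is not a store temporary but the requantized low group of four 32-bit results: an arithmetic right shift of the Q31 product rounded to nearest by subtracting the comparison mask of the shifted-out remainder against a threshold, after which the two groups are packed to 16 bits with saturation and offset by the output zero point (lines 216/366). A sketch of one such group, with the remainder computation reconstructed from the usual XNNPACK Q31 requantization (an assumption, since the quoted lines are truncated):

    #include <emmintrin.h>

    // Sketch (assumed reconstruction) of the rounding shift behind the truncated
    // lines above. The kernel applies this twice, producing vout_lo and vout_hi,
    // before the saturating pack and zero-point add quoted on lines 216/366.
    static __m128i q31_rounding_shift_sse2(__m128i vq31prod, __m128i vremainder_mask,
                                           __m128i vremainder_threshold, __m128i vshift) {
      // Remainder of the shift, biased by -1 for negative products so that the
      // threshold comparison implements round-to-nearest, ties away from zero.
      const __m128i vrem = _mm_add_epi32(
          _mm_and_si128(vq31prod, vremainder_mask),
          _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod));
      // Shift, then add 1 (subtract the all-ones mask) where the remainder
      // exceeds the threshold.
      return _mm_sub_epi32(_mm_sra_epi32(vq31prod, vshift),
                           _mm_cmpgt_epi32(vrem, vremainder_threshold));
    }

    // Usage, mirroring lines 216/366 of the quoted kernel:
    //   __m128i vout = _mm_adds_epi16(_mm_packs_epi32(vout_lo, vout_hi), voutput_zero_point);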