/external/XNNPACK/src/u8-clamp/ |
D | neon-x64.c | 44 uint8x8_t vout = vld1_u8(x); x += 8; in xnn_u8_clamp_ukernel__neon_x64() local 45 vout = vmin_u8(vout, vget_low_u8(voutput_max)); in xnn_u8_clamp_ukernel__neon_x64() 46 vout = vmax_u8(vout, vget_low_u8(voutput_min)); in xnn_u8_clamp_ukernel__neon_x64() 47 vst1_u8(y, vout); y += 8; in xnn_u8_clamp_ukernel__neon_x64() 50 uint8x8_t vout = vld1_u8(x); in xnn_u8_clamp_ukernel__neon_x64() local 51 vout = vmin_u8(vout, vget_low_u8(voutput_max)); in xnn_u8_clamp_ukernel__neon_x64() 52 vout = vmax_u8(vout, vget_low_u8(voutput_min)); in xnn_u8_clamp_ukernel__neon_x64() 55 vst1_lane_u32(__builtin_assume_aligned(y, 1), vreinterpret_u32_u8(vout), 0); y += 4; in xnn_u8_clamp_ukernel__neon_x64() 56 vout = vext_u8(vout, vout, 4); in xnn_u8_clamp_ukernel__neon_x64() 59 vst1_lane_u16(__builtin_assume_aligned(y, 1), vreinterpret_u16_u8(vout), 0); y += 2; in xnn_u8_clamp_ukernel__neon_x64() [all …]
|
D | sse2-x64.c | 45 __m128i vout = _mm_loadl_epi64((const __m128i*) x); in xnn_u8_clamp_ukernel__sse2_x64() local 47 vout = _mm_min_epu8(vout, voutput_max); in xnn_u8_clamp_ukernel__sse2_x64() 48 vout = _mm_max_epu8(vout, voutput_min); in xnn_u8_clamp_ukernel__sse2_x64() 49 _mm_storel_epi64((__m128i*) y, vout); in xnn_u8_clamp_ukernel__sse2_x64() 53 __m128i vout = _mm_loadl_epi64((const __m128i*) x); in xnn_u8_clamp_ukernel__sse2_x64() local 54 vout = _mm_min_epu8(vout, voutput_max); in xnn_u8_clamp_ukernel__sse2_x64() 55 vout = _mm_max_epu8(vout, voutput_min); in xnn_u8_clamp_ukernel__sse2_x64() 57 *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vout); in xnn_u8_clamp_ukernel__sse2_x64() 59 vout = _mm_srli_epi64(vout, 32); in xnn_u8_clamp_ukernel__sse2_x64() 62 *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vout, 0); in xnn_u8_clamp_ukernel__sse2_x64() [all …]
|
/external/XNNPACK/src/qs8-vaddc/ |
D | wasmsimd.c.in | 51 …v128_t vout${ABC[N:N+8]} = wasm_i16x8_add_saturate(wasm_i16x8_narrow_i32x4(vacc${ABC[N:N+4]}, vacc… 55 … v128_t vout${ABC[N:N+16]} = wasm_i8x16_narrow_i16x8(vout${ABC[N:N+8]}, vout${ABC[N+8:N+16]}); 57 …v128_t vout${ABC[N:N+8]}${ABC[N:N+8]} = wasm_i8x16_narrow_i16x8(vout${ABC[N:N+8]}, vout${ABC[N:N+8… 61 vout${ABC[N:N+16]} = wasm_i8x16_max(vout${ABC[N:N+16]}, voutput_min); 63 … vout${ABC[N:N+8]}${ABC[N:N+8]} = wasm_i8x16_max(vout${ABC[N:N+8]}${ABC[N:N+8]}, voutput_min); 67 vout${ABC[N:N+16]} = wasm_i8x16_min(vout${ABC[N:N+16]}, voutput_max); 69 … vout${ABC[N:N+8]}${ABC[N:N+8]} = wasm_i8x16_min(vout${ABC[N:N+8]}${ABC[N:N+8]}, voutput_max); 72 wasm_v128_store(output, vout${ABC[0:16]}); 74 *((double*) output) = wasm_f64x2_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0); 77 wasm_v128_store(output + ${N}, vout${ABC[N:N+16]}); [all …]
|
D | avx2-mul32-ld64.c.in | 57 …__m256i vout${ABC[N:N+4]}${ABC[N+8:N+12]}${ABC[N+4:N+8]}${ABC[N+12:N+16]} = _mm256_adds_epi16(_mm2… 59 …__m128i vout${ABC[N:N+8]} = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc${ABC[N:N+8]… 61 …__m128i vout${ABC[N:N+8]} = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc${ABC[N:N+8]… 65 …vout${ABC[N:N+4]}${ABC[N+8:N+12]}${ABC[N+4:N+8]}${ABC[N+12:N+16]} = _mm256_min_epi16(_mm256_max_ep… 67 …vout${ABC[N:N+8]} = _mm_min_epi16(_mm_max_epi16(vout${ABC[N:N+8]}, _mm256_castsi256_si128(voutput_… 69 … vout${ABC[N:N+8]} = _mm_min_epi16(_mm_max_epi16(vout${ABC[N:N+8]}, voutput_min), voutput_max); 73 …vout${ABC[N:N+16]} = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout${ABC[N:N+4]}${A… 75 … __m128i vout${ABC[N:N+8]}${ABC[N:N+8]} = _mm_packs_epi16(vout${ABC[N:N+8]}, vout${ABC[N:N+8]}); 78 _mm_storeu_si128((__m128i*) output, vout${ABC[0:16]}); 80 _mm_storel_epi64((__m128i*) output, vout${ABC[0:8]}${ABC[0:8]}); [all …]
|
D | sse-mul32-ld32.c.in | 65 …__m128i vout${ABC[N:N+8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[N:N+4]}, vacc${ABC[N+4:N+8]})… 68 vout${ABC[N:N+8]} = _mm_max_epi16(vout${ABC[N:N+8]}, voutput_min); 71 vout${ABC[N:N+8]} = _mm_min_epi16(vout${ABC[N:N+8]}, voutput_max); 75 const __m128i vout${ABC[N:N+16]} = _mm_packs_epi16(vout${ABC[N:N+8]}, vout${ABC[N+8:N+16]}); 77 …const __m128i vout${ABC[N:N+8]}${ABC[N:N+8]} = _mm_packs_epi16(vout${ABC[N:N+8]}, vout${ABC[N:N+8]… 80 _mm_storeu_si128((__m128i*) output, vout${ABC[0:16]}); 82 _mm_storel_epi64((__m128i*) output, vout${ABC[0:8]}${ABC[0:8]}); 85 _mm_storeu_si128((__m128i*) (output + ${N}), vout${ABC[N:N+16]}); 87 _mm_storel_epi64((__m128i*) (output + ${N}), vout${ABC[N:N+8]}${ABC[N:N+8]}); 110 …__m128i vout${ABC[0:8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[0:4]}, vacc${ABC[4:8]}), voutpu… [all …]
|
D | neon-ld64.c.in | 60 …int8x16_t vout${ABC[N:N+16]} = vcombine_s8(vqmovn_s16(vacc${ABC[N:N+8]}), vqmovn_s16(vacc${ABC[N+8… 62 int8x8_t vout${ABC[N:N+8]} = vqmovn_s16(vacc${ABC[N:N+8]}); 66 vout${ABC[N:N+16]} = vmaxq_s8(vout${ABC[N:N+16]}, voutput_min); 68 vout${ABC[N:N+8]} = vmax_s8(vout${ABC[N:N+8]}, vget_low_s8(voutput_min)); 72 vout${ABC[N:N+16]} = vminq_s8(vout${ABC[N:N+16]}, voutput_max); 74 vout${ABC[N:N+8]} = vmin_s8(vout${ABC[N:N+8]}, vget_low_s8(voutput_max)); 78 vst1q_s8(output, vout${ABC[N:N+16]}); output += 16; 80 vst1_s8(output, vout${ABC[N:N+8]}); output += 8; 102 int8x8_t vout${ABC[0:8]} = vqmovn_s16(vacc${ABC[0:8]}); 103 vout${ABC[0:8]} = vmax_s8(vout${ABC[0:8]}, vget_low_s8(voutput_min)); [all …]
|
D | sse-mul16-ld64.c.in | 73 …__m128i vout${ABC[N:N+8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[N:N+4]}, vacc${ABC[N+4:N+8]})… 76 vout${ABC[N:N+8]} = _mm_max_epi16(vout${ABC[N:N+8]}, voutput_min); 79 vout${ABC[N:N+8]} = _mm_min_epi16(vout${ABC[N:N+8]}, voutput_max); 83 const __m128i vout${ABC[N:N+16]} = _mm_packs_epi16(vout${ABC[N:N+8]}, vout${ABC[N+8:N+16]}); 85 …const __m128i vout${ABC[N:N+8]}${ABC[N:N+8]} = _mm_packs_epi16(vout${ABC[N:N+8]}, vout${ABC[N:N+8]… 88 _mm_storeu_si128((__m128i*) output, vout${ABC[0:16]}); 90 _mm_storel_epi64((__m128i*) output, vout${ABC[0:8]}${ABC[0:8]}); 93 _mm_storeu_si128((__m128i*) (output + ${N}), vout${ABC[N:N+16]}); 95 _mm_storel_epi64((__m128i*) (output + ${N}), vout${ABC[N:N+8]}${ABC[N:N+8]}); 126 …__m128i vout${ABC[0:8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[0:4]}, vacc${ABC[4:8]}), voutpu… [all …]
|
/external/XNNPACK/src/qs8-vadd/ |
D | wasmsimd.c.in | 57 …v128_t vout${ABC[N:N+8]} = wasm_i16x8_add_saturate(wasm_i16x8_narrow_i32x4(vacc${ABC[N:N+4]}, vacc… 61 … v128_t vout${ABC[N:N+16]} = wasm_i8x16_narrow_i16x8(vout${ABC[N:N+8]}, vout${ABC[N+8:N+16]}); 63 …v128_t vout${ABC[N:N+8]}${ABC[N:N+8]} = wasm_i8x16_narrow_i16x8(vout${ABC[N:N+8]}, vout${ABC[N:N+8… 67 vout${ABC[N:N+16]} = wasm_i8x16_max(vout${ABC[N:N+16]}, voutput_min); 69 … vout${ABC[N:N+8]}${ABC[N:N+8]} = wasm_i8x16_max(vout${ABC[N:N+8]}${ABC[N:N+8]}, voutput_min); 73 vout${ABC[N:N+16]} = wasm_i8x16_min(vout${ABC[N:N+16]}, voutput_max); 75 … vout${ABC[N:N+8]}${ABC[N:N+8]} = wasm_i8x16_min(vout${ABC[N:N+8]}${ABC[N:N+8]}, voutput_max); 78 wasm_v128_store(output, vout${ABC[0:16]}); 80 *((double*) output) = wasm_f64x2_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0); 83 wasm_v128_store(output + ${N}, vout${ABC[N:N+16]}); [all …]
|
D | avx2-mul32-ld64.c.in | 62 …__m256i vout${ABC[N:N+4]}${ABC[N+8:N+12]}${ABC[N+4:N+8]}${ABC[N+12:N+16]} = _mm256_adds_epi16(_mm2… 64 …__m128i vout${ABC[N:N+8]} = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc${ABC[N:N+8]… 66 …__m128i vout${ABC[N:N+8]} = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc${ABC[N:N+8]… 70 …vout${ABC[N:N+4]}${ABC[N+8:N+12]}${ABC[N+4:N+8]}${ABC[N+12:N+16]} = _mm256_min_epi16(_mm256_max_ep… 72 …vout${ABC[N:N+8]} = _mm_min_epi16(_mm_max_epi16(vout${ABC[N:N+8]}, _mm256_castsi256_si128(voutput_… 74 … vout${ABC[N:N+8]} = _mm_min_epi16(_mm_max_epi16(vout${ABC[N:N+8]}, voutput_min), voutput_max); 78 …vout${ABC[N:N+16]} = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout${ABC[N:N+4]}${A… 80 … __m128i vout${ABC[N:N+8]}${ABC[N:N+8]} = _mm_packs_epi16(vout${ABC[N:N+8]}, vout${ABC[N:N+8]}); 83 _mm_storeu_si128((__m128i*) output, vout${ABC[0:16]}); 85 _mm_storel_epi64((__m128i*) output, vout${ABC[0:8]}${ABC[0:8]}); [all …]
|
D | neon-ld64.c.in | 61 …int8x16_t vout${ABC[N:N+16]} = vcombine_s8(vqmovn_s16(vacc${ABC[N:N+8]}), vqmovn_s16(vacc${ABC[N+8… 63 int8x8_t vout${ABC[N:N+8]} = vqmovn_s16(vacc${ABC[N:N+8]}); 67 vout${ABC[N:N+16]} = vmaxq_s8(vout${ABC[N:N+16]}, voutput_min); 69 vout${ABC[N:N+8]} = vmax_s8(vout${ABC[N:N+8]}, vget_low_s8(voutput_min)); 73 vout${ABC[N:N+16]} = vminq_s8(vout${ABC[N:N+16]}, voutput_max); 75 vout${ABC[N:N+8]} = vmin_s8(vout${ABC[N:N+8]}, vget_low_s8(voutput_max)); 79 vst1q_s8(output, vout${ABC[N:N+16]}); output += 16; 81 vst1_s8(output, vout${ABC[N:N+8]}); output += 8; 109 int8x8_t vout${ABC[0:8]} = vqmovn_s16(vacc${ABC[0:8]}); 110 vout${ABC[0:8]} = vmax_s8(vout${ABC[0:8]}, vget_low_s8(voutput_min)); [all …]
|
D | sse-mul32-ld32.c.in | 72 …__m128i vout${ABC[N:N+8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[N:N+4]}, vacc${ABC[N+4:N+8]})… 75 vout${ABC[N:N+8]} = _mm_max_epi16(vout${ABC[N:N+8]}, voutput_min); 78 vout${ABC[N:N+8]} = _mm_min_epi16(vout${ABC[N:N+8]}, voutput_max); 82 const __m128i vout${ABC[N:N+16]} = _mm_packs_epi16(vout${ABC[N:N+8]}, vout${ABC[N+8:N+16]}); 84 …const __m128i vout${ABC[N:N+8]}${ABC[N:N+8]} = _mm_packs_epi16(vout${ABC[N:N+8]}, vout${ABC[N:N+8]… 87 _mm_storeu_si128((__m128i*) output, vout${ABC[0:16]}); 89 _mm_storel_epi64((__m128i*) output, vout${ABC[0:8]}${ABC[0:8]}); 92 _mm_storeu_si128((__m128i*) (output + ${N}), vout${ABC[N:N+16]}); 94 _mm_storel_epi64((__m128i*) (output + ${N}), vout${ABC[N:N+8]}${ABC[N:N+8]}); 126 …__m128i vout${ABC[0:8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[0:4]}, vacc${ABC[4:8]}), voutpu… [all …]
|
/external/XNNPACK/src/qs8-dwconv/ |
D | unipass-neon-mul16.c.in | 82 …int8x16_t vout${ABC[C:C+16]} = vqmovn_high_s16(vqmovn_s16(vacc${ABC[C:C+8]}), vacc${ABC[C+8:C+16]}… 84 int8x8_t vout${ABC[C:C+8]} = vqmovn_s16(vacc${ABC[C:C+8]}); 91 …int8x16_t vout${ABC[C:C+16]} = vcombine_s8(vqmovn_s16(vacc${ABC[C:C+8]}), vqmovn_s16(vacc${ABC[C+8… 93 int8x8_t vout${ABC[C:C+8]} = vqmovn_s16(vacc${ABC[C:C+8]}); 98 vout${ABC[C:C+16]} = vmaxq_s8(vout${ABC[C:C+16]}, voutput_min); 101 vout${ABC[C:C+8]} = vmax_s8(vout${ABC[C:C+8]}, voutput_min); 103 vout${ABC[C:C+8]} = vmax_s8(vout${ABC[C:C+8]}, vget_low_s8(voutput_min)); 107 vout${ABC[C:C+16]} = vminq_s8(vout${ABC[C:C+16]}, voutput_max); 110 vout${ABC[C:C+8]} = vmin_s8(vout${ABC[C:C+8]}, voutput_max); 112 vout${ABC[C:C+8]} = vmin_s8(vout${ABC[C:C+8]}, vget_low_s8(voutput_max)); [all …]
|
/external/XNNPACK/src/f32-gavgpool/ |
D | 7x-minmax-wasmsimd-x86-c4.c | 80 v128_t vout = wasm_f32x4_mul(vsum, vscale); in xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4() local 81 const v128_t vufmask = wasm_f32x4_lt(vout, vmin); in xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4() 82 const v128_t vofmask = wasm_f32x4_le(vmax, vout); in xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4() 83 vout = wasm_v128_bitselect(vmin, vout, vufmask); in xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4() 84 vout = wasm_v128_bitselect(vmax, vout, vofmask); in xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4() 86 wasm_v128_store(output, vout); in xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4() 109 v128_t vout = wasm_f32x4_mul(vsum, vscale); in xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4() local 110 const v128_t vufmask = wasm_f32x4_lt(vout, vmin); in xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4() 111 const v128_t vofmask = wasm_f32x4_le(vmax, vout); in xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4() 112 vout = wasm_v128_bitselect(vmin, vout, vufmask); in xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4() [all …]
|
D | 7x-minmax-sse-c4.c | 80 __m128 vout = _mm_mul_ps(vsum, vscale); in xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4() local 81 vout = _mm_max_ps(vout, vmin); in xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4() 82 vout = _mm_min_ps(vout, vmax); in xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4() 84 _mm_storeu_ps(output, vout); in xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4() 107 __m128 vout = _mm_mul_ps(vsum, vscale); in xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4() local 108 vout = _mm_max_ps(vout, vmin); in xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4() 109 vout = _mm_min_ps(vout, vmax); in xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4() 112 _mm_storel_pi((__m64*) output, vout); in xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4() 113 vout = _mm_movehl_ps(vout, vout); in xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4() 117 _mm_store_ss(output, vout); in xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4()
|
D | 7x-minmax-wasmsimd-arm-c4.c | 80 v128_t vout = wasm_f32x4_mul(vsum, vscale); in xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4() local 81 vout = wasm_f32x4_max(vout, vmin); in xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4() 82 vout = wasm_f32x4_min(vout, vmax); in xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4() 84 wasm_v128_store(output, vout); in xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4() 107 v128_t vout = wasm_f32x4_mul(vsum, vscale); in xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4() local 108 vout = wasm_f32x4_max(vout, vmin); in xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4() 109 vout = wasm_f32x4_min(vout, vmax); in xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4() 112 *((double*) output) = wasm_f64x2_extract_lane(vout, 0); in xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4() 114 vout = wasm_v32x4_shuffle(vout, vout, 2, 3, 2, 3); in xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4() 117 *output++ = wasm_f32x4_extract_lane(vout, 0); in xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4()
|
/external/XNNPACK/src/qs8-igemm/ |
D | MRxNRc4-neondot.c.in | 138 …int8x16_t vout${M}x${ABC[N:N+16]} = vqmovn_high_s16(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vacc${M}x$… 140 …int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vqmovn_high_s16(vqmovn_s16(vacc${M-1}x${AB… 142 int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]}); 151 …int8x16_t vout${M}x${ABC[N:N+16]} = vcombine_s8(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vqmovn_s16(vac… 153 …int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vcombine_s8(vqmovn_s16(vacc${M-1}x${ABC[N:… 155 int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]}); 167 vout${M}x${ABC[N:N+16]} = vmaxq_s8(vout${M}x${ABC[N:N+16]}, voutput_min); 169 …vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vmaxq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}… 172 vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, voutput_min); 174 vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_min)); [all …]
|
D | c16-neon-mlal-padal.c.in | 141 …int8x16_t vout${M}x${ABC[N:N+16]} = vqmovn_high_s16(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vacc${M}x$… 143 …int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vqmovn_high_s16(vqmovn_s16(vacc${M-1}x${AB… 145 int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]}); 154 …int8x16_t vout${M}x${ABC[N:N+16]} = vcombine_s8(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vqmovn_s16(vac… 156 …int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vcombine_s8(vqmovn_s16(vacc${M-1}x${ABC[N:… 158 int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]}); 170 vout${M}x${ABC[N:N+16]} = vmaxq_s8(vout${M}x${ABC[N:N+16]}, voutput_min); 172 …vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vmaxq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}… 175 vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, voutput_min); 177 vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_min)); [all …]
|
D | c8-neon-mull-padal.c.in | 159 …int8x16_t vout${M}x${ABC[N:N+16]} = vqmovn_high_s16(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vacc${M}x$… 161 …int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vqmovn_high_s16(vqmovn_s16(vacc${M-1}x${AB… 163 int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]}); 172 …int8x16_t vout${M}x${ABC[N:N+16]} = vcombine_s8(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vqmovn_s16(vac… 174 …int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vcombine_s8(vqmovn_s16(vacc${M-1}x${ABC[N:… 176 int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]}); 188 vout${M}x${ABC[N:N+16]} = vmaxq_s8(vout${M}x${ABC[N:N+16]}, voutput_min); 190 …vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vmaxq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}… 193 vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, voutput_min); 195 vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_min)); [all …]
|
/external/XNNPACK/src/qs8-gemm/ |
D | MRxNRc4-neondot.c.in | 131 …int8x16_t vout${M}x${ABC[N:N+16]} = vqmovn_high_s16(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vacc${M}x$… 133 …int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vqmovn_high_s16(vqmovn_s16(vacc${M-1}x${AB… 135 int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]}); 144 …int8x16_t vout${M}x${ABC[N:N+16]} = vcombine_s8(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vqmovn_s16(vac… 146 …int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vcombine_s8(vqmovn_s16(vacc${M-1}x${ABC[N:… 148 int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]}); 160 vout${M}x${ABC[N:N+16]} = vmaxq_s8(vout${M}x${ABC[N:N+16]}, voutput_min); 162 …vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vmaxq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}… 165 vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, voutput_min); 167 vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_min)); [all …]
|
D | c16-neon-mlal-padal.c.in | 130 …int8x16_t vout${M}x${ABC[N:N+16]} = vqmovn_high_s16(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vacc${M}x$… 132 …int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vqmovn_high_s16(vqmovn_s16(vacc${M-1}x${AB… 134 int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]}); 143 …int8x16_t vout${M}x${ABC[N:N+16]} = vcombine_s8(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vqmovn_s16(vac… 145 …int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vcombine_s8(vqmovn_s16(vacc${M-1}x${ABC[N:… 147 int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]}); 159 vout${M}x${ABC[N:N+16]} = vmaxq_s8(vout${M}x${ABC[N:N+16]}, voutput_min); 161 …vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vmaxq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}… 164 vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, voutput_min); 166 vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_min)); [all …]
|
D | c8-neon-mull-padal.c.in | 148 …int8x16_t vout${M}x${ABC[N:N+16]} = vqmovn_high_s16(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vacc${M}x$… 150 …int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vqmovn_high_s16(vqmovn_s16(vacc${M-1}x${AB… 152 int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]}); 161 …int8x16_t vout${M}x${ABC[N:N+16]} = vcombine_s8(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vqmovn_s16(vac… 163 …int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vcombine_s8(vqmovn_s16(vacc${M-1}x${ABC[N:… 165 int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]}); 177 vout${M}x${ABC[N:N+16]} = vmaxq_s8(vout${M}x${ABC[N:N+16]}, voutput_min); 179 …vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vmaxq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}… 182 vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, voutput_min); 184 vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_min)); [all …]
|
/external/XNNPACK/src/f32-avgpool/ |
D | 9x-minmax-wasmsimd-x86-c4.c | 136 v128_t vout = wasm_f32x4_mul(vsum, vscale); in xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_x86_c4() local 137 const v128_t vufmask = wasm_f32x4_lt(vout, vmin); in xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_x86_c4() 138 const v128_t vofmask = wasm_f32x4_le(vmax, vout); in xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_x86_c4() 139 vout = wasm_v128_bitselect(vmin, vout, vufmask); in xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_x86_c4() 140 vout = wasm_v128_bitselect(vmax, vout, vofmask); in xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_x86_c4() 142 wasm_v128_store(output, vout); in xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_x86_c4() 167 v128_t vout = wasm_f32x4_mul(vsum, vscale); in xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_x86_c4() local 168 const v128_t vufmask = wasm_f32x4_lt(vout, vmin); in xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_x86_c4() 169 const v128_t vofmask = wasm_f32x4_le(vmax, vout); in xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_x86_c4() 170 vout = wasm_v128_bitselect(vmin, vout, vufmask); in xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_x86_c4() [all …]
|
/external/XNNPACK/src/f32-pavgpool/ |
D | 9x-minmax-wasmsimd-x86-c4.c | 139 v128_t vout = wasm_f32x4_mul(vsum, vmultiplier); in xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_x86_c4() local 140 const v128_t vufmask = wasm_f32x4_lt(vout, vmin); in xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_x86_c4() 141 const v128_t vofmask = wasm_f32x4_le(vmax, vout); in xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_x86_c4() 142 vout = wasm_v128_bitselect(vmin, vout, vufmask); in xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_x86_c4() 143 vout = wasm_v128_bitselect(vmax, vout, vofmask); in xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_x86_c4() 145 wasm_v128_store(output, vout); in xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_x86_c4() 170 v128_t vout = wasm_f32x4_mul(vsum, vmultiplier); in xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_x86_c4() local 171 const v128_t vufmask = wasm_f32x4_lt(vout, vmin); in xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_x86_c4() 172 const v128_t vofmask = wasm_f32x4_le(vmax, vout); in xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_x86_c4() 173 vout = wasm_v128_bitselect(vmin, vout, vufmask); in xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_x86_c4() [all …]
|
/external/XNNPACK/src/u8-maxpool/ |
D | 9p8x-minmax-sse2-c16.c | 100 const __m128i vout = _mm_max_epu8(_mm_min_epu8(vmax, voutput_max), voutput_min); in xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16() local 102 _mm_storeu_si128((__m128i*) o, vout); o += 16; in xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16() 123 __m128i vout = _mm_max_epu8(_mm_min_epu8(vmax, voutput_max), voutput_min); in xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16() local 126 _mm_storel_epi64((__m128i*) o, vout); in xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16() 127 vout = _mm_unpackhi_epi64(vout, vout); in xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16() 131 *((uint32_t*) o) = (uint32_t) _mm_cvtsi128_si32(vout); in xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16() 132 vout = _mm_srli_epi64(vout, 32); in xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16() 136 *((uint16_t*) o) = (uint16_t) _mm_extract_epi16(vout, 0); in xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16() 137 vout = _mm_srli_epi32(vout, 16); in xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16() 141 *((uint8_t*) o) = (uint8_t) _mm_cvtsi128_si32(vout); in xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16() [all …]
|
/external/XNNPACK/src/qu8-gavgpool/ |
D | 7x-minmax-sse2-c8.c | 120 __m128i vout = _mm_packs_epi32(vscaled_lo, vscaled_hi); in xnn_qu8_gavgpool_minmax_ukernel_7x__sse2_c8() local 121 vout = _mm_adds_epi16(vout, _mm_load_si128((const __m128i*) params->sse2.output_zero_point)); in xnn_qu8_gavgpool_minmax_ukernel_7x__sse2_c8() 122 vout = _mm_packus_epi16(vout, vout); in xnn_qu8_gavgpool_minmax_ukernel_7x__sse2_c8() 123 vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->sse2.output_max)); in xnn_qu8_gavgpool_minmax_ukernel_7x__sse2_c8() 124 vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->sse2.output_min)); in xnn_qu8_gavgpool_minmax_ukernel_7x__sse2_c8() 126 _mm_storel_epi64((__m128i*) output, vout); output += 8; in xnn_qu8_gavgpool_minmax_ukernel_7x__sse2_c8() 189 __m128i vout = _mm_packs_epi32(vscaled_lo, vscaled_hi); in xnn_qu8_gavgpool_minmax_ukernel_7x__sse2_c8() local 190 vout = _mm_adds_epi16(vout, _mm_load_si128((const __m128i*) params->sse2.output_zero_point)); in xnn_qu8_gavgpool_minmax_ukernel_7x__sse2_c8() 191 vout = _mm_packus_epi16(vout, vout); in xnn_qu8_gavgpool_minmax_ukernel_7x__sse2_c8() 192 vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->sse2.output_max)); in xnn_qu8_gavgpool_minmax_ukernel_7x__sse2_c8() [all …]
|