/external/XNNPACK/src/f32-qs8-vcvt/ |
D | avx512skx.c.in | 10 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 52 __m512 vx${ABC[N:N+4]} = _mm512_loadu_ps(x + ${N * 4}); 56 vx${ABC[N:N+4]} = _mm512_mul_ps(vx${ABC[N:N+4]}, vscale); 59 vx${ABC[N:N+4]} = _mm512_min_ps(vx${ABC[N:N+4]}, voutput_max_less_zero_point); 62 const __m512i vacc${ABC[N:N+4]} = _mm512_cvtps_epi32(vx${ABC[N:N+4]}); 65 …2i vacc${ABC[N]}${ABC[N+4]}${ABC[N+1]}${ABC[N+5]}${ABC[N+2]}${ABC[N+6]}${ABC[N+3]}${ABC[N+7]} = _m… 68 …ABC[N]}${ABC[N+4]}${ABC[N+1]}${ABC[N+5]}${ABC[N+2]}${ABC[N+6]}${ABC[N+3]}${ABC[N+7]} = _mm512_adds… 72 …ABC[N]}${ABC[N+4]}${ABC[N+8]}${ABC[N+12]}${ABC[N+1]}${ABC[N+5]}${ABC[N+9]}${ABC[N+13]}${ABC[N+2]}$… 74 …ABC[N]}${ABC[N+4]}${ABC[N+2]}${ABC[N+6]}${ABC[N+1]}${ABC[N+5]}${ABC[N+3]}${ABC[N+7]} = ${_MM256_PA… 78 …ABC[N]}${ABC[N+4]}${ABC[N+8]}${ABC[N+12]}${ABC[N+1]}${ABC[N+5]}${ABC[N+9]}${ABC[N+13]}${ABC[N+2]}$… [all …]
|
/external/XNNPACK/src/qs8-vadd/ |
D | sse-mul16-ld64.c.in | 12 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 47 const __m128i va${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) input_a)); 48 const __m128i vb${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) input_b)); 50 …const __m128i va${ABC[N:N+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (input_a + $… 51 …const __m128i vb${ABC[N:N+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (input_b + $… 53 __m128i va${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) input_a); 54 __m128i vb${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) input_b); 56 __m128i va${ABC[N:N+8]} = _mm_loadl_epi64((const __m128i*) (input_a + ${N})); 57 __m128i vb${ABC[N:N+8]} = _mm_loadl_epi64((const __m128i*) (input_b + ${N})); 65 va${ABC[N:N+8]} = _mm_unpacklo_epi8(va${ABC[N:N+8]}, vzero); [all …]
|
D | avx2-mul32-ld64.c.in | 9 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 42 … const __m256i va${ABC[0:8]} = ${_MM256_CVTEPX8_EPI32}(_mm_loadl_epi64((const __m128i*) input_a)); 43 … const __m256i vb${ABC[0:8]} = ${_MM256_CVTEPX8_EPI32}(_mm_loadl_epi64((const __m128i*) input_b)); 45 …const __m256i va${ABC[N:N+8]} = ${_MM256_CVTEPX8_EPI32}(_mm_loadl_epi64((const __m128i*) (input_a … 46 …const __m256i vb${ABC[N:N+8]} = ${_MM256_CVTEPX8_EPI32}(_mm_loadl_epi64((const __m128i*) (input_b … 51 …__m256i vacc${ABC[N:N+8]} = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va${ABC[N:N+8]}, va_multipl… 54 …vacc${ABC[N:N+8]} = _mm256_add_epi32(vacc${ABC[N:N+8]}, _mm256_mullo_epi32(vb${ABC[N:N+8]}, vb_mul… 57 vacc${ABC[N:N+8]} = _mm256_sra_epi32(vacc${ABC[N:N+8]}, vshift); 61 …_m256i vout${ABC[N:N+4]}${ABC[N+8:N+12]}${ABC[N+4:N+8]}${ABC[N+12:N+16]} = _mm256_adds_epi16(_mm25… 63 …__m128i vout${ABC[N:N+8]} = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc${ABC[N:N+8]… [all …]
|
D | sse-mul32-ld32.c.in | 11 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 52 …const __m128i va${ABC[0:4]} = ${_MM_CVTEPX8_EPI32}(_mm_cvtsi32_si128((int) unaligned_load_s32(inpu… 53 …const __m128i vb${ABC[0:4]} = ${_MM_CVTEPX8_EPI32}(_mm_cvtsi32_si128((int) unaligned_load_s32(inpu… 55 …const __m128i va${ABC[N:N+4]} = ${_MM_CVTEPX8_EPI32}(_mm_cvtsi32_si128((int) unaligned_load_s32(in… 56 …const __m128i vb${ABC[N:N+4]} = ${_MM_CVTEPX8_EPI32}(_mm_cvtsi32_si128((int) unaligned_load_s32(in… 62 __m128i vacc${ABC[N:N+4]} = _mm_macc_epi32(va${ABC[N:N+4]}, va_multiplier, vbias); 65 vacc${ABC[N:N+4]} = _mm_macc_epi32(vb${ABC[N:N+4]}, vb_multiplier, vacc${ABC[N:N+4]}); 68 … __m128i vacc${ABC[N:N+4]} = _mm_add_epi32(vbias, _mm_mullo_epi32(va${ABC[N:N+4]}, va_multiplier)); 71 …vacc${ABC[N:N+4]} = _mm_add_epi32(vacc${ABC[N:N+4]}, _mm_mullo_epi32(vb${ABC[N:N+4]}, vb_multiplie… 74 vacc${ABC[N:N+4]} = _mm_sra_epi32(vacc${ABC[N:N+4]}, vshift); [all …]
|
D | wasmsimd.c.in | 9 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 40 const v128_t va${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(input_a); 41 const v128_t vb${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(input_b); 43 const v128_t va${ABC[N:N+8]} = ${WASM_X16X8_LOAD8X8}(input_a + ${N}); 44 const v128_t vb${ABC[N:N+8]} = ${WASM_X16X8_LOAD8X8}(input_b + ${N}); 49 …v128_t vacc${ABC[N:N+4]} = wasm_i32x4_add(vbias, wasm_i32x4_mul(${WASM_X32X4_EXTEND_LOW_X16X8}(va$… 50 …v128_t vacc${ABC[N+4:N+8]} = wasm_i32x4_add(vbias, wasm_i32x4_mul(${WASM_X32X4_EXTEND_HIGH_X16X8}(… 53 …vacc${ABC[N:N+4]} = wasm_i32x4_add(vacc${ABC[N:N+4]}, wasm_i32x4_mul(${WASM_X32X4_EXTEND_LOW_X16X8… 54 …vacc${ABC[N+4:N+8]} = wasm_i32x4_add(vacc${ABC[N+4:N+8]}, wasm_i32x4_mul(${WASM_X32X4_EXTEND_HIGH_… 57 vacc${ABC[N:N+4]} = wasm_i32x4_shr(vacc${ABC[N:N+4]}, vshift); [all …]
|
D | avx512skx-mul32-ld128.c.in | 9 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 47 … const __m512i va${ABC[0:16]} = ${_MM512_CVTEPX8_EPI32}(_mm_loadu_si128((const __m128i*) input_a)); 48 … const __m512i vb${ABC[0:16]} = ${_MM512_CVTEPX8_EPI32}(_mm_loadu_si128((const __m128i*) input_b)); 50 …const __m512i va${ABC[N:N+16]} = ${_MM512_CVTEPX8_EPI32}(_mm_loadu_si128((const __m128i*) (input_a… 51 …const __m512i vb${ABC[N:N+16]} = ${_MM512_CVTEPX8_EPI32}(_mm_loadu_si128((const __m128i*) (input_b… 56 …__m512i vacc${ABC[N:N+16]} = _mm512_add_epi32(vbias, _mm512_mullo_epi32(va${ABC[N:N+16]}, va_multi… 59 …vacc${ABC[N:N+16]} = _mm512_add_epi32(vacc${ABC[N:N+16]}, _mm512_mullo_epi32(vb${ABC[N:N+16]}, vb_… 62 vacc${ABC[N:N+16]} = _mm512_sra_epi32(vacc${ABC[N:N+16]}, vshift); 66 …ABC[N:N+4]}${ABC[N+16:N+20]}${ABC[N+4:N+8]}${ABC[N+20:N+24]}${ABC[N+8:N+12]}${ABC[N+24:N+28]}${ABC… 68 …ABC[N:N+4]}${ABC[N+8:N+12]}${ABC[N+4:N+8]}${ABC[N+12:N+16]} = _mm256_adds_epi16(_mm256_packs_epi32… [all …]
|
/external/XNNPACK/src/qs8-vmul/ |
D | sse-mul16-ld64.c.in | 13 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 46 const __m128i va${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) input_a)); 47 const __m128i vb${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) input_b)); 49 …const __m128i va${ABC[N:N+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (input_a + $… 50 …const __m128i vb${ABC[N:N+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (input_b + $… 52 __m128i va${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) input_a); 53 __m128i vb${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) input_b); 55 __m128i va${ABC[N:N+8]} = _mm_loadl_epi64((const __m128i*) (input_a + ${N})); 56 __m128i vb${ABC[N:N+8]} = _mm_loadl_epi64((const __m128i*) (input_b + ${N})); 64 va${ABC[N:N+8]} = _mm_unpacklo_epi8(va${ABC[N:N+8]}, vzero); [all …]
|
D | wasmsimd-mul32-ld64.c.in | 10 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 42 const v128_t va${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(input_a); 43 const v128_t vb${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(input_b); 45 const v128_t va${ABC[N:N+8]} = ${WASM_X16X8_LOAD8X8}(input_a + ${N}); 46 const v128_t vb${ABC[N:N+8]} = ${WASM_X16X8_LOAD8X8}(input_b + ${N}); 51 const v128_t vxa${ABC[N:N+8]} = wasm_i16x8_sub(va${ABC[N:N+8]}, va_zero_point); 52 const v128_t vxb${ABC[N:N+8]} = wasm_i16x8_sub(vb${ABC[N:N+8]}, vb_zero_point); 55 …v128_t vacc${ABC[N:N+4]} = wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(vxa${ABC[N:N+8]}), wasm_i32x… 56 …v128_t vacc${ABC[N+4:N+8]} = wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(vxa${ABC[N:N+8]}), wasm_i… 59 vacc${ABC[N:N+4]} = wasm_f32x4_convert_i32x4(vacc${ABC[N:N+4]}); [all …]
|
D | neon.c.in | 10 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 82 const ${XINT8X16_T} va${ABC[N:N+16]} = ${VLD1Q_X8}(input_a); input_a += 16; 83 const ${XINT8X16_T} vb${ABC[N:N+16]} = ${VLD1Q_X8}(input_b); input_b += 16; 88 …const int16x8_t vxa${ABC[N:N+8]} = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(va${ABC[N:N+16]}), v… 89 …const int16x8_t vxa${ABC[N+8:N+16]} = vreinterpretq_s16_u16(vsubl_high_u8(va${ABC[N:N+16]}, va_zer… 90 …const int16x8_t vxb${ABC[N:N+8]} = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(vb${ABC[N:N+16]}), v… 91 …const int16x8_t vxb${ABC[N+8:N+16]} = vreinterpretq_s16_u16(vsubl_high_u8(vb${ABC[N:N+16]}, vb_zer… 93 …const int16x8_t vxa${ABC[N:N+8]} = vsubl_s8(vget_low_s8(va${ABC[N:N+16]}), vget_low_s8(va_zero_poi… 94 const int16x8_t vxa${ABC[N+8:N+16]} = vsubl_high_s8(va${ABC[N:N+16]}, va_zero_point); 95 …const int16x8_t vxb${ABC[N:N+8]} = vsubl_s8(vget_low_s8(vb${ABC[N:N+16]}), vget_low_s8(vb_zero_poi… [all …]
|
/external/XNNPACK/src/qs8-gavgpool/ |
D | multipass-wasmsimd.c.in | 13 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 54 const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); 56 const v128_t vxi${M}x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i${M} + ${C}); 59 v128_t vacc${ABC[0:8]} = wasm_i16x8_add(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); 60 const v128_t vxi2x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i2); 62 v128_t vacc${ABC[C:C+8]} = wasm_i16x8_add(vxi0x${ABC[C:C+8]}, vxi1x${ABC[C:C+8]}); 63 const v128_t vxi2x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i2 + ${C}); 67 vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); 68 const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); 70 vacc${ABC[C:C+8]} = wasm_i16x8_add(vacc${ABC[C:C+8]}, vxi${M-1}x${ABC[C:C+8]}); [all …]
|
D | multipass-sse2.c.in | 13 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 56 … __m128i vacc${ABC[C:C+8]} = _mm_add_epi16(vxi${M-3}x${ABC[C:C+8]}, vxi${M-2}x${ABC[C:C+8]}); 58 vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${M-2}x${ABC[C:C+8]}); 61 …const __m128i vxi${M-1}x${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, v… 63 … const __m128i vxi${M-1}x${ABC[C:C+8]} = _mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, vzero); 66 const __m128i vi${M}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${M}); 68 const __m128i vi${M}x${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) (i${M} + ${C})); 74 … const __m128i vsgnacc${ABC[C:C+8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[C:C+8]}); 75 __m128i vacc${ABC[C:C+4]} = _mm_unpacklo_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]}); 76 __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]}); [all …]
|
D | multipass-sse4.c.in | 13 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 54 … const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); 56 …const __m128i vxi${M}x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i${M}… 59 __m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); 60 const __m128i vxi2x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2)); 62 __m128i vacc${ABC[C:C+8]} = _mm_add_epi16(vxi0x${ABC[C:C+8]}, vxi1x${ABC[C:C+8]}); 63 …const __m128i vxi2x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i2 + ${C… 67 vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); 68 … const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); 70 vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${M-1}x${ABC[C:C+8]}); [all …]
|
D | unipass-sse2.c.in | 11 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 59 … __m128i vacc${ABC[C:C+8]} = _mm_add_epi16(vxi${M-3}x${ABC[C:C+8]}, vxi${M-2}x${ABC[C:C+8]}); 61 vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${M-2}x${ABC[C:C+8]}); 64 …const __m128i vxi${M-1}x${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, v… 66 … const __m128i vxi${M-1}x${ABC[C:C+8]} = _mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, vzero); 69 const __m128i vi${M}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${M}); 71 const __m128i vi${M}x${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) (i${M} + ${C})); 77 … const __m128i vsgnacc${ABC[C:C+8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[C:C+8]}); 78 __m128i vacc${ABC[C:C+4]} = _mm_unpacklo_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]}); 79 __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]}); [all …]
|
D | unipass-sse4.c.in | 11 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 57 … const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); 59 …const __m128i vxi${M}x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i${M}… 62 __m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); 63 const __m128i vxi2x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2)); 65 __m128i vacc${ABC[C:C+8]} = _mm_add_epi16(vxi0x${ABC[C:C+8]}, vxi1x${ABC[C:C+8]}); 66 …const __m128i vxi2x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i2 + ${C… 70 vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); 71 … const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); 73 vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${M-1}x${ABC[C:C+8]}); [all …]
|
D | unipass-wasmsimd.c.in | 11 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 58 const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); 60 const v128_t vxi${M}x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i${M} + ${C}); 63 v128_t vacc${ABC[0:8]} = wasm_i16x8_add(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); 64 const v128_t vxi2x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i2); 66 v128_t vacc${ABC[C:C+8]} = wasm_i16x8_add(vxi0x${ABC[C:C+8]}, vxi1x${ABC[C:C+8]}); 67 const v128_t vxi2x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i2 + ${C}); 71 vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); 72 const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); 74 vacc${ABC[C:C+8]} = wasm_i16x8_add(vacc${ABC[C:C+8]}, vxi${M-1}x${ABC[C:C+8]}); [all …]
|
D | multipass-neon.c.in | 13 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 76 const ${XINT8X8_T} vi${M}x${ABC[C:C+8]} = ${VLD1_X8}(i${M}); i${M} += 8; 79 const ${XINT8X8_T} vi2x${ABC[C:C+8]} = ${VLD1_X8}(i2); i2 += 8; 80 ${XINT16X8_T} vsum${ABC[C:C+8]} = ${VADDL_X8}(vi0x${ABC[C:C+8]}, vi1x${ABC[C:C+8]}); 85 const ${XINT8X8_T} vi${M+1}x${ABC[C:C+8]} = ${VLD1_X8}(i${M+1}); i${M+1} += 8; 86 vsum${ABC[C:C+8]} = ${VADDW_X8}(vsum${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]}); 90 const int32x4_t vacc${ABC[C:C+4]} = vaddw_s16(vinit_bias, vget_low_s16(vsum${ABC[C:C+8]})); 91 … const int32x4_t vacc${ABC[C+4:C+8]} = vaddw_s16(vinit_bias, vget_high_s16(vsum${ABC[C:C+8]})); 93 …const int32x4_t vacc${ABC[C:C+4]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bi… 94 …const int32x4_t vacc${ABC[C+4:C+8]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_… [all …]
|
/external/XNNPACK/src/qs8-dwconv/ |
D | unipass-avx2-mul16-vpunpck.c.in | 6 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 49 __m256i vacc${ABC[0:8]} = _mm256_loadu_si256((const __m256i*) w); 51 …__m256i vacc${ABC[C:C+8]} = _mm256_loadu_si256((const __m256i*) ((uintptr_t) w + ${C} * sizeof(int… 54 …__m256i vacc${ABC[C:C+4]}${ABC[C+8:C+12]} = _mm256_inserti128_si256(vacc${ABC[C:C+8]}, _mm256_cast… 55 …__m256i vacc${ABC[C+4:C+8]}${ABC[C+12:C+16]} = _mm256_permute2x128_si256(vacc${ABC[C:C+8]}, vacc${… 61 … const __m256i vi${K}x${ABC[0:16]} = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i${K})); 63 …const __m256i vi${K}x${ABC[C:C+16]} = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) (i${K}… 64 …const __m256i vk${K}x${ABC[C:C+16]} = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uint… 70 … __m256i vacc${ABC[C:C+16]} = _mm256_mullo_epi16(vi${K}x${ABC[C:C+16]}, vk${K}x${ABC[C:C+16]}); 72 vacc${ABC[C:C+16]} = _mm256_mullo_epi16(vi${K}x${ABC[C:C+16]}, vk${K}x${ABC[C:C+16]}); [all …]
|
D | unipass-sse-mul16.c.in | 11 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 65 __m128i vacc${ABC[0:4]} = _mm_loadu_si128((const __m128i*) w); 67 __m128i vacc${ABC[C:C+4]} = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + ${C})); 73 const __m128i vi${K}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${K}); 75 const __m128i vi${K}x${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) (i${K} + ${C})); 78 const __m128i vxi${K}x${ABC[C:C+8]} = _mm_cvtepu8_epi16(vi${K}x${ABC[C:C+8]}); 80 const __m128i vxi${K}x${ABC[C:C+8]} = _mm_cvtepi8_epi16(vi${K}x${ABC[C:C+8]}); 81 …const __m128i vk${K}x${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + ${CHANNEL_T… 84 …const __m128i vxk${K}x${ABC[C:C+8]} = _mm_sub_epi16(_mm_cvtepu8_epi16(vk${K}x${ABC[C:C+8]}), vk_ze… 86 const __m128i vxk${K}x${ABC[C:C+8]} = _mm_cvtepi8_epi16(vk${K}x${ABC[C:C+8]}); [all …]
|
D | unipass-avx512skx-mul32.c.in | 6 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 70 __m512i vacc${ABC[0:16]} = _mm512_loadu_si512(w); 72 …__m512i vacc${ABC[C:C+16]} = _mm512_loadu_si512((const void*) ((uintptr_t) w + ${C} * sizeof(int32… 78 …const __m512i vi${K}x${ABC[0:16]} = ${_MM512_CVTEPX8_EPI32}(_mm_loadu_si128((const __m128i*) i${K}… 80 …const __m512i vi${K}x${ABC[C:C+16]} = ${_MM512_CVTEPX8_EPI32}(_mm_loadu_si128((const __m128i*) (i$… 82 …const __m512i vk${K}x${ABC[C:C+16]} = _mm512_sub_epi32(_mm512_cvtepu8_epi32(_mm_load_si128((const … 84 …const __m512i vk${K}x${ABC[C:C+16]} = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintp… 88 …vacc${ABC[C:C+16]} = _mm512_add_epi32(vacc${ABC[C:C+16]}, _mm512_mullo_epi32(vi${K}x${ABC[C:C+16]}… 93 __m512 vscaled${ABC[C:C+16]} = _mm512_cvtepi32_ps(vacc${ABC[C:C+16]}); 96 const __m512 vscale${ABC[0:16]} = _mm512_loadu_ps(w); [all …]
|
D | unipass-wasmsimd-mul16.c.in | 6 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 57 v128_t vacc${ABC[0:4]} = wasm_v128_load(w); 59 … v128_t vacc${ABC[C:C+4]} = wasm_v128_load((const void*) ((uintptr_t) w + ${C} * sizeof(int32_t))); 65 const v128_t vi${K}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${K}); 67 const v128_t vi${K}x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i${K} + ${C}); 68 …const v128_t vk${K}x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}((const void*) ((uintptr_t) w + ${CHANNEL… 72 v128_t vsumx${ABC[C:C+8]} = wasm_i16x8_add(vi0x${ABC[C:C+8]}, vi1x${ABC[C:C+8]}); 74 vsumx${ABC[C:C+8]} = wasm_i16x8_add(vsumx${ABC[C:C+8]}, vi${K}x${ABC[C:C+8]}); 79 v128_t vprod${ABC[C:C+8]} = wasm_i16x8_mul(vi${K}x${ABC[C:C+8]}, vk${K}x${ABC[C:C+8]}); 81 vprod${ABC[C:C+8]} = wasm_i16x8_mul(vi${K}x${ABC[C:C+8]}, vk${K}x${ABC[C:C+8]}); [all …]
|
D | unipass-neon-mul8.c.in | 9 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 72 int32x4_t vacc${ABC[C:C+4]} = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); 77 const int8x16_t vi${K}x${ABC[C:C+16]} = vld1q_s8(i${K}); i${K} += 16; 78 … const int8x16_t vk${K}x${ABC[C:C+16]} = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16); 82 …int16x8_t vprod${ABC[C:C+8]} = vmull_s8(vget_low_s8(vi${K}x${ABC[C:C+16]}), vget_low_s8(vk${K}x${A… 83 …int16x8_t vprod${ABC[C+8:C+16]} = vmull_s8(vget_high_s8(vi${K}x${ABC[C:C+16]}), vget_high_s8(vk${K… 86 …vprod${ABC[C:C+8]} = vmull_s8(vget_low_s8(vi${K}x${ABC[C:C+16]}), vget_low_s8(vk${K}x${ABC[C:C+16]… 87 …vprod${ABC[C+8:C+16]} = vmull_s8(vget_high_s8(vi${K}x${ABC[C:C+16]}), vget_high_s8(vk${K}x${ABC[C:… 90 …vprod${ABC[C:C+8]} = vmlal_s8(vprod${ABC[C:C+8]}, vget_low_s8(vi${K}x${ABC[C:C+16]}), vget_low_s8(… 91 …vprod${ABC[C+8:C+16]} = vmlal_s8(vprod${ABC[C+8:C+16]}, vget_high_s8(vi${K}x${ABC[C:C+16]}), vget_… [all …]
|
/external/XNNPACK/src/qs8-vmulc/ |
D | sse-mul16-ld64.c.in | 13 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 48 const __m128i va${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) input_a)); 50 …const __m128i va${ABC[N:N+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (input_a + $… 52 __m128i va${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) input_a); 54 __m128i va${ABC[N:N+8]} = _mm_loadl_epi64((const __m128i*) (input_a + ${N})); 61 va${ABC[N:N+8]} = _mm_unpacklo_epi8(va${ABC[N:N+8]}, vzero); 64 va${ABC[N:N+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(va${ABC[N:N+8]}, va${ABC[N:N+8]}), 8); 67 const __m128i vxa${ABC[N:N+8]} = _mm_sub_epi16(va${ABC[N:N+8]}, va_zero_point); 70 const __m128i vprod${ABC[N:N+8]}lo = _mm_mullo_epi16(vxa${ABC[N:N+8]}, vxb); 71 const __m128i vprod${ABC[N:N+8]}hi = _mm_mulhi_epi16(vxa${ABC[N:N+8]}, vxb); [all …]
|
D | wasmsimd-mul32-ld64.c.in | 10 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 45 const v128_t va${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(input_a); 47 const v128_t va${ABC[N:N+8]} = ${WASM_X16X8_LOAD8X8}(input_a + ${N}); 51 const v128_t vxa${ABC[N:N+8]} = wasm_i16x8_sub(va${ABC[N:N+8]}, va_zero_point); 54 … v128_t vacc${ABC[N:N+4]} = wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(vxa${ABC[N:N+8]}), vxblo); 55 …v128_t vacc${ABC[N+4:N+8]} = wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(vxa${ABC[N:N+8]}), vxbhi); 58 vacc${ABC[N:N+4]} = wasm_f32x4_convert_i32x4(vacc${ABC[N:N+4]}); 61 vacc${ABC[N:N+4]} = wasm_f32x4_mul(vacc${ABC[N:N+4]}, vscale); 64 vacc${ABC[N:N+4]} = wasm_f32x4_add(vacc${ABC[N:N+4]}, vmagic_bias); 67 vacc${ABC[N:N+4]} = wasm_i32x4_max(vacc${ABC[N:N+4]}, vmagic_min); [all …]
|
/external/XNNPACK/src/qs8-vaddc/ |
D | sse-mul16-ld64.c.in | 12 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 47 const __m128i va${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) input_a)); 49 …const __m128i va${ABC[N:N+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (input_a + $… 51 __m128i va${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) input_a); 53 __m128i va${ABC[N:N+8]} = _mm_loadl_epi64((const __m128i*) (input_a + ${N})); 60 va${ABC[N:N+8]} = _mm_unpacklo_epi8(va${ABC[N:N+8]}, vzero); 63 va${ABC[N:N+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(va${ABC[N:N+8]}, va${ABC[N:N+8]}), 8); 66 __m128i vaprod${ABC[N:N+8]}hi = _mm_mulhi_epu16(va${ABC[N:N+8]}, va_multiplier_lo); 67 const __m128i vaprod${ABC[N:N+8]}lo = _mm_mullo_epi16(va${ABC[N:N+8]}, va_multiplier_lo); 70 …vaprod${ABC[N:N+8]}hi = _mm_add_epi16(vaprod${ABC[N:N+8]}hi, _mm_mullo_epi16(va${ABC[N:N+8]}, va_m… [all …]
|
D | avx2-mul32-ld64.c.in | 9 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 43 … const __m256i va${ABC[0:8]} = ${_MM256_CVTEPX8_EPI32}(_mm_loadl_epi64((const __m128i*) input_a)); 45 …const __m256i va${ABC[N:N+8]} = ${_MM256_CVTEPX8_EPI32}(_mm_loadl_epi64((const __m128i*) (input_a … 49 …__m256i vacc${ABC[N:N+8]} = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va${ABC[N:N+8]}, va_multipl… 52 vacc${ABC[N:N+8]} = _mm256_sra_epi32(vacc${ABC[N:N+8]}, vshift); 56 …_m256i vout${ABC[N:N+4]}${ABC[N+8:N+12]}${ABC[N+4:N+8]}${ABC[N+12:N+16]} = _mm256_adds_epi16(_mm25… 58 …__m128i vout${ABC[N:N+8]} = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc${ABC[N:N+8]… 60 …__m128i vout${ABC[N:N+8]} = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc${ABC[N:N+8]… 64 …ABC[N:N+16]} = _mm_shuffle_epi32(${_MM_PACKXS_EPI16}(_mm256_castsi256_si128(vout${ABC[N:N+4]}${ABC… 66 …__m128i vout${ABC[N:N+8]}${ABC[N:N+8]} = ${_MM_PACKXS_EPI16}(vout${ABC[N:N+8]}, vout${ABC[N:N+8]}); [all …]
|